]> git.openfabrics.org - ~shefty/rdma-dev.git/blob - net/ipv4/ip_gre.c
a85ae2f7a21cb15502bd69e9c63a1ec29020fa40
[~shefty/rdma-dev.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
67    it is infeasible task. The most general solutions would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
86      would even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
96    taking into account fragmentation. To be short, ttl is not a solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
123 static bool log_ecn_error = true;
124 module_param(log_ecn_error, bool, 0644);
125 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128 static int ipgre_tunnel_init(struct net_device *dev);
129 static void ipgre_tunnel_setup(struct net_device *dev);
130 static int ipgre_tunnel_bind_dev(struct net_device *dev);
131
/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace state, registered via net_generic(). */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	/* 4 hash tables indexed by address-wildcard "prio" (see the
	 * "4 hash tables" comment below); chains are RCU-protected. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* The always-present catch-all device used as a last resort
	 * by ipgre_tunnel_lookup(). */
	struct net_device *fb_tunnel_dev;
};
142
143 /* Tunnel hash table */
144
145 /*
146    4 hash tables:
147
148    3: (remote,local)
149    2: (remote,*)
150    1: (*,local)
151    0: (*,*)
152
153    We require exact key match i.e. if a key is present in packet
154    it will match only tunnel with the same key; if it is not present,
155    it will match only keyless tunnel.
156
157    All keyless packets, if not matching configured keyless tunnels,
158    will match the fallback tunnel.
159  */
160
/* Fold a 32-bit value (address or key) into a bucket index in
 * [0, HASH_SIZE) by xor-ing its two low nibbles. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases matching the wildcard "prio" layout computed in __ipgre_bucket(). */
#define tunnels_r_l     tunnels[3]	/* (remote, local) */
#define tunnels_r       tunnels[2]	/* (remote, *)     */
#define tunnels_l       tunnels[1]	/* (*, local)      */
#define tunnels_wc      tunnels[0]	/* (*, *)          */
167
168 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
169                                                    struct rtnl_link_stats64 *tot)
170 {
171         int i;
172
173         for_each_possible_cpu(i) {
174                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176                 unsigned int start;
177
178                 do {
179                         start = u64_stats_fetch_begin_bh(&tstats->syncp);
180                         rx_packets = tstats->rx_packets;
181                         tx_packets = tstats->tx_packets;
182                         rx_bytes = tstats->rx_bytes;
183                         tx_bytes = tstats->tx_bytes;
184                 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185
186                 tot->rx_packets += rx_packets;
187                 tot->tx_packets += tx_packets;
188                 tot->rx_bytes   += rx_bytes;
189                 tot->tx_bytes   += tx_bytes;
190         }
191
192         tot->multicast = dev->stats.multicast;
193         tot->rx_crc_errors = dev->stats.rx_crc_errors;
194         tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195         tot->rx_length_errors = dev->stats.rx_length_errors;
196         tot->rx_frame_errors = dev->stats.rx_frame_errors;
197         tot->rx_errors = dev->stats.rx_errors;
198
199         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200         tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201         tot->tx_dropped = dev->stats.tx_dropped;
202         tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203         tot->tx_errors = dev->stats.tx_errors;
204
205         return tot;
206 }
207
208 /* Does key in tunnel parameters match packet */
209 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
210                             __be16 flags, __be32 key)
211 {
212         if (p->i_flags & GRE_KEY) {
213                 if (flags & GRE_KEY)
214                         return key == p->i_key;
215                 else
216                         return false;   /* key expected, none present */
217         } else
218                 return !(flags & GRE_KEY);
219 }
220
/* Given src, dst and key, find appropriate for input tunnel. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* An ETH_P_TEB payload belongs to an Ethernet (gretap) device;
	 * any other payload to a plain ARPHRD_IPGRE device. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* Candidate score: bit 0 = bound to a different link,
	 * bit 1 = device type differs.  0 is a perfect match;
	 * cand_score starts at 4, i.e. "no candidate yet". */
	int score, cand_score = 4;

	/* Pass 1: (remote, local) tunnels. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		/* ARPHRD_IPGRE devices accept either payload type. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: (remote, *) tunnels. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: (*, local) tunnels; a multicast destination address on
	 * the tunnel may also match the packet's local address. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: (*, *) wildcard tunnels; the key must still match exactly. */
	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Best imperfect match found across all passes, if any. */
	if (cand != NULL)
		return cand;

	/* Last resort: the per-namespace fallback device, if it is up. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
351
352 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
353                 struct ip_tunnel_parm *parms)
354 {
355         __be32 remote = parms->iph.daddr;
356         __be32 local = parms->iph.saddr;
357         __be32 key = parms->i_key;
358         unsigned int h = HASH(key);
359         int prio = 0;
360
361         if (local)
362                 prio |= 1;
363         if (remote && !ipv4_is_multicast(remote)) {
364                 prio |= 2;
365                 h ^= HASH(remote);
366         }
367
368         return &ign->tunnels[prio][h];
369 }
370
/* Hash-chain head for an existing tunnel, keyed on its own parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
376
/*
 * Insert @t at the head of its hash chain.  Uses rtnl_dereference for
 * the read side, so the caller is expected to hold RTNL; publication
 * via rcu_assign_pointer keeps lockless readers consistent.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	/* Set t->next before making t itself visible on the chain. */
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
384
/*
 * Remove @t from its hash chain.  The chain is walked with
 * rtnl_dereference (caller expected to hold RTNL) and the entry is
 * spliced out with rcu_assign_pointer so that concurrent RCU readers
 * always see a valid list.  Silently does nothing if @t is not linked.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
399
400 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
401                                            struct ip_tunnel_parm *parms,
402                                            int type)
403 {
404         __be32 remote = parms->iph.daddr;
405         __be32 local = parms->iph.saddr;
406         __be32 key = parms->i_key;
407         int link = parms->link;
408         struct ip_tunnel *t;
409         struct ip_tunnel __rcu **tp;
410         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
411
412         for (tp = __ipgre_bucket(ign, parms);
413              (t = rtnl_dereference(*tp)) != NULL;
414              tp = &t->next)
415                 if (local == t->parms.iph.saddr &&
416                     remote == t->parms.iph.daddr &&
417                     key == t->parms.i_key &&
418                     link == t->parms.link &&
419                     type == t->dev->type)
420                         break;
421
422         return t;
423 }
424
/*
 * Find a tunnel matching @parms, or, when @create is set, allocate and
 * register a new ARPHRD_IPGRE device for it.  Returns the tunnel, or
 * NULL when creation fails (or when nothing matched and !@create).
 * Uses rtnl_dereference via ipgre_tunnel_find() and calls
 * register_netdevice(), so the caller is expected to hold RTNL.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");	/* "%d" is filled in at registration */

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Hold a reference for the hash table; released in
	 * ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
469
/*
 * ndo_uninit handler: unhash the tunnel and drop the device reference
 * taken when the tunnel was linked (see ipgre_tunnel_locate()).
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
478
479
/*
 * ICMP error handler for GRE-in-IP.  @skb points at the returned copy
 * of our own outer IP header; locate the originating tunnel and record
 * PMTU/redirect/error state on it.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	/* p points at the start of the returned GRE header:
	 * p[0] = flags, p[1] = protocol. */
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	/* grehlen: outer IP header plus the 4 mandatory GRE bytes;
	 * grows below when optional checksum/key words are present. */
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		/* Only GRE version 0 without routing is supported. */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			/* The key word follows the optional checksum word. */
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	/* The key is the last 32-bit word of the option area. */
	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	/* iph is OUR outer header echoed back, so daddr is the tunnel's
	 * remote and saddr its local address when looking up. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	/* NBMA (daddr == 0) and multicast tunnels have no single peer
	 * to attribute the error to. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	/* ttl == 0 means TTL inherited from inner packet, so
	 * TIME_EXCEEDED is expected for e.g. traceroute. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	/* Error bookkeeping consumed by the transmit path. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}
580
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583 {
584         u8 inner = 0;
585         if (skb->protocol == htons(ETH_P_IP))
586                 inner = old_iph->tos;
587         else if (skb->protocol == htons(ETH_P_IPV6))
588                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589         return INET_ECN_encapsulate(tos, inner);
590 }
591
/*
 * GRE receive handler.  Parses the GRE header (version 0 only),
 * matches a tunnel, validates checksum/key/sequence options against
 * the tunnel configuration, decapsulates and hands the inner packet
 * to the stack via gro_cells.  Always consumes the skb; returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* mandatory GRE header: flags + protocol */
	__be16 gre_proto;
	int    err;

	/* 16 = 4 mandatory bytes + up to 3 optional 4-byte words
	 * (checksum, key, sequence). */
	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			/* Verify the GRE checksum over the whole packet;
			 * csum stays 0 on success. */
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4X = an IPv4 version nibble follows directly;
			 * otherwise a WCCPv2 redirect header is present. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header; the inner packet starts at offset. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* A present checksum must have verified (csum == 0), and a
		 * tunnel configured with GRE_CSUM requires one. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Drop packets without a sequence number or with
			 * one older than the last accepted. */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Re-fetch iph: pskb_may_pull may have moved data. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			/* err > 1 means the packet must be dropped, not
			 * merely logged. */
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	/* No tunnel matched: reject the packet. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
737
738 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
739 {
740         struct ip_tunnel *tunnel = netdev_priv(dev);
741         const struct iphdr  *old_iph = ip_hdr(skb);
742         const struct iphdr  *tiph;
743         struct flowi4 fl4;
744         u8     tos;
745         __be16 df;
746         struct rtable *rt;                      /* Route to the other host */
747         struct net_device *tdev;                /* Device to other host */
748         struct iphdr  *iph;                     /* Our new IP header */
749         unsigned int max_headroom;              /* The extra header space needed */
750         int    gre_hlen;
751         __be32 dst;
752         int    mtu;
753
754         if (skb->ip_summed == CHECKSUM_PARTIAL &&
755             skb_checksum_help(skb))
756                 goto tx_error;
757
758         if (dev->type == ARPHRD_ETHER)
759                 IPCB(skb)->flags = 0;
760
761         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
762                 gre_hlen = 0;
763                 tiph = (const struct iphdr *)skb->data;
764         } else {
765                 gre_hlen = tunnel->hlen;
766                 tiph = &tunnel->parms.iph;
767         }
768
769         if ((dst = tiph->daddr) == 0) {
770                 /* NBMA tunnel */
771
772                 if (skb_dst(skb) == NULL) {
773                         dev->stats.tx_fifo_errors++;
774                         goto tx_error;
775                 }
776
777                 if (skb->protocol == htons(ETH_P_IP)) {
778                         rt = skb_rtable(skb);
779                         dst = rt_nexthop(rt, old_iph->daddr);
780                 }
781 #if IS_ENABLED(CONFIG_IPV6)
782                 else if (skb->protocol == htons(ETH_P_IPV6)) {
783                         const struct in6_addr *addr6;
784                         struct neighbour *neigh;
785                         bool do_tx_error_icmp;
786                         int addr_type;
787
788                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
789                         if (neigh == NULL)
790                                 goto tx_error;
791
792                         addr6 = (const struct in6_addr *)&neigh->primary_key;
793                         addr_type = ipv6_addr_type(addr6);
794
795                         if (addr_type == IPV6_ADDR_ANY) {
796                                 addr6 = &ipv6_hdr(skb)->daddr;
797                                 addr_type = ipv6_addr_type(addr6);
798                         }
799
800                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
801                                 do_tx_error_icmp = true;
802                         else {
803                                 do_tx_error_icmp = false;
804                                 dst = addr6->s6_addr32[3];
805                         }
806                         neigh_release(neigh);
807                         if (do_tx_error_icmp)
808                                 goto tx_error_icmp;
809                 }
810 #endif
811                 else
812                         goto tx_error;
813         }
814
815         tos = tiph->tos;
816         if (tos == 1) {
817                 tos = 0;
818                 if (skb->protocol == htons(ETH_P_IP))
819                         tos = old_iph->tos;
820                 else if (skb->protocol == htons(ETH_P_IPV6))
821                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
822         }
823
824         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
825                                  tunnel->parms.o_key, RT_TOS(tos),
826                                  tunnel->parms.link);
827         if (IS_ERR(rt)) {
828                 dev->stats.tx_carrier_errors++;
829                 goto tx_error;
830         }
831         tdev = rt->dst.dev;
832
833         if (tdev == dev) {
834                 ip_rt_put(rt);
835                 dev->stats.collisions++;
836                 goto tx_error;
837         }
838
839         df = tiph->frag_off;
840         if (df)
841                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
842         else
843                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
844
845         if (skb_dst(skb))
846                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
847
848         if (skb->protocol == htons(ETH_P_IP)) {
849                 df |= (old_iph->frag_off&htons(IP_DF));
850
851                 if ((old_iph->frag_off&htons(IP_DF)) &&
852                     mtu < ntohs(old_iph->tot_len)) {
853                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
854                         ip_rt_put(rt);
855                         goto tx_error;
856                 }
857         }
858 #if IS_ENABLED(CONFIG_IPV6)
859         else if (skb->protocol == htons(ETH_P_IPV6)) {
860                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
861
862                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
863                         if ((tunnel->parms.iph.daddr &&
864                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
865                             rt6->rt6i_dst.plen == 128) {
866                                 rt6->rt6i_flags |= RTF_MODIFIED;
867                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
868                         }
869                 }
870
871                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
872                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
873                         ip_rt_put(rt);
874                         goto tx_error;
875                 }
876         }
877 #endif
878
879         if (tunnel->err_count > 0) {
880                 if (time_before(jiffies,
881                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
882                         tunnel->err_count--;
883
884                         dst_link_failure(skb);
885                 } else
886                         tunnel->err_count = 0;
887         }
888
889         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
890
891         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
892             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
893                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
894                 if (max_headroom > dev->needed_headroom)
895                         dev->needed_headroom = max_headroom;
896                 if (!new_skb) {
897                         ip_rt_put(rt);
898                         dev->stats.tx_dropped++;
899                         dev_kfree_skb(skb);
900                         return NETDEV_TX_OK;
901                 }
902                 if (skb->sk)
903                         skb_set_owner_w(new_skb, skb->sk);
904                 dev_kfree_skb(skb);
905                 skb = new_skb;
906                 old_iph = ip_hdr(skb);
907         }
908
909         skb_reset_transport_header(skb);
910         skb_push(skb, gre_hlen);
911         skb_reset_network_header(skb);
912         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
913         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
914                               IPSKB_REROUTED);
915         skb_dst_drop(skb);
916         skb_dst_set(skb, &rt->dst);
917
918         /*
919          *      Push down and install the IPIP header.
920          */
921
922         iph                     =       ip_hdr(skb);
923         iph->version            =       4;
924         iph->ihl                =       sizeof(struct iphdr) >> 2;
925         iph->frag_off           =       df;
926         iph->protocol           =       IPPROTO_GRE;
927         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
928         iph->daddr              =       fl4.daddr;
929         iph->saddr              =       fl4.saddr;
930
931         if ((iph->ttl = tiph->ttl) == 0) {
932                 if (skb->protocol == htons(ETH_P_IP))
933                         iph->ttl = old_iph->ttl;
934 #if IS_ENABLED(CONFIG_IPV6)
935                 else if (skb->protocol == htons(ETH_P_IPV6))
936                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
937 #endif
938                 else
939                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
940         }
941
942         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
943         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
944                                    htons(ETH_P_TEB) : skb->protocol;
945
946         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
947                 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
948
949                 if (tunnel->parms.o_flags&GRE_SEQ) {
950                         ++tunnel->o_seqno;
951                         *ptr = htonl(tunnel->o_seqno);
952                         ptr--;
953                 }
954                 if (tunnel->parms.o_flags&GRE_KEY) {
955                         *ptr = tunnel->parms.o_key;
956                         ptr--;
957                 }
958                 if (tunnel->parms.o_flags&GRE_CSUM) {
959                         *ptr = 0;
960                         *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
961                 }
962         }
963
964         iptunnel_xmit(skb, dev);
965         return NETDEV_TX_OK;
966
967 #if IS_ENABLED(CONFIG_IPV6)
968 tx_error_icmp:
969         dst_link_failure(skb);
970 #endif
971 tx_error:
972         dev->stats.tx_errors++;
973         dev_kfree_skb(skb);
974         return NETDEV_TX_OK;
975 }
976
/*
 * Bind the tunnel to its underlying egress device and derive a usable MTU.
 *
 * Looks up a route to the tunnel's remote endpoint (if configured) to guess
 * the output device, falls back to the explicitly configured link index,
 * precomputes the GRE encapsulation length into tunnel->hlen, and sizes
 * dev->needed_headroom.  Returns the MTU the tunnel device should use,
 * clamped to a minimum of 68 (the IPv4 minimum MTU).
 */
977 static int ipgre_tunnel_bind_dev(struct net_device *dev)
978 {
979         struct net_device *tdev = NULL;
980         struct ip_tunnel *tunnel;
981         const struct iphdr *iph;
982         int hlen = LL_MAX_HEADER;
983         int mtu = ETH_DATA_LEN;
/* Base encapsulation cost: outer IPv4 header + 4 bytes of GRE flags/proto. */
984         int addend = sizeof(struct iphdr) + 4;
985
986         tunnel = netdev_priv(dev);
987         iph = &tunnel->parms.iph;
988
989         /* Guess output device to choose reasonable mtu and needed_headroom */
990
991         if (iph->daddr) {
992                 struct flowi4 fl4;
993                 struct rtable *rt;
994
995                 rt = ip_route_output_gre(dev_net(dev), &fl4,
996                                          iph->daddr, iph->saddr,
997                                          tunnel->parms.o_key,
998                                          RT_TOS(iph->tos),
999                                          tunnel->parms.link);
1000                 if (!IS_ERR(rt)) {
1001                         tdev = rt->dst.dev;
1002                         ip_rt_put(rt);
1003                 }
1004
1005                 if (dev->type != ARPHRD_ETHER)
1006                         dev->flags |= IFF_POINTOPOINT;
1007         }
1008
/* No route found: fall back to the administratively configured link. */
1009         if (!tdev && tunnel->parms.link)
1010                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1011
1012         if (tdev) {
1013                 hlen = tdev->hard_header_len + tdev->needed_headroom;
1014                 mtu = tdev->mtu;
1015         }
1016         dev->iflink = tunnel->parms.link;
1017
1018         /* Precalculate GRE options length */
1019         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1020                 if (tunnel->parms.o_flags&GRE_CSUM)
1021                         addend += 4;
1022                 if (tunnel->parms.o_flags&GRE_KEY)
1023                         addend += 4;
1024                 if (tunnel->parms.o_flags&GRE_SEQ)
1025                         addend += 4;
1026         }
1027         dev->needed_headroom = addend + hlen;
1028         mtu -= dev->hard_header_len + addend;
1029
1030         if (mtu < 68)
1031                 mtu = 68;
1032
1033         tunnel->hlen = addend;
1034
1035         return mtu;
1036 }
1037
/*
 * Legacy ioctl interface for GRE tunnels (SIOCGETTUNNEL / SIOCADDTUNNEL /
 * SIOCCHGTUNNEL / SIOCDELTUNNEL), operating on a struct ip_tunnel_parm
 * passed through ifr->ifr_ifru.ifru_data.  Add/change/delete require
 * CAP_NET_ADMIN in the owning user namespace.  Returns 0 or a negative
 * errno.
 */
1038 static int
1039 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1040 {
1041         int err = 0;
1042         struct ip_tunnel_parm p;
1043         struct ip_tunnel *t;
1044         struct net *net = dev_net(dev);
1045         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1046
1047         switch (cmd) {
1048         case SIOCGETTUNNEL:
1049                 t = NULL;
/* On the fallback device the user selects a tunnel by parameters;
 * otherwise report this device's own configuration. */
1050                 if (dev == ign->fb_tunnel_dev) {
1051                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1052                                 err = -EFAULT;
1053                                 break;
1054                         }
1055                         t = ipgre_tunnel_locate(net, &p, 0);
1056                 }
1057                 if (t == NULL)
1058                         t = netdev_priv(dev);
1059                 memcpy(&p, &t->parms, sizeof(p));
1060                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1061                         err = -EFAULT;
1062                 break;
1063
1064         case SIOCADDTUNNEL:
1065         case SIOCCHGTUNNEL:
1066                 err = -EPERM;
1067                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1068                         goto done;
1069
1070                 err = -EFAULT;
1071                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1072                         goto done;
1073
/* Sanity-check the outer header template: IPv4, GRE, no IP options,
 * no fragment bits other than DF, and no unsupported GRE flag bits. */
1074                 err = -EINVAL;
1075                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1076                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1077                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1078                         goto done;
1079                 if (p.iph.ttl)
1080                         p.iph.frag_off |= htons(IP_DF);
1081
1082                 if (!(p.i_flags&GRE_KEY))
1083                         p.i_key = 0;
1084                 if (!(p.o_flags&GRE_KEY))
1085                         p.o_key = 0;
1086
1087                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1088
1089                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1090                         if (t != NULL) {
1091                                 if (t->dev != dev) {
1092                                         err = -EEXIST;
1093                                         break;
1094                                 }
1095                         } else {
1096                                 unsigned int nflags = 0;
1097
1098                                 t = netdev_priv(dev);
1099
1100                                 if (ipv4_is_multicast(p.iph.daddr))
1101                                         nflags = IFF_BROADCAST;
1102                                 else if (p.iph.daddr)
1103                                         nflags = IFF_POINTOPOINT;
1104
/* Refuse changes that would flip the device between broadcast and
 * point-to-point mode after creation. */
1105                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1106                                         err = -EINVAL;
1107                                         break;
1108                                 }
/* Re-key the tunnel: drop it from the hash table, wait for readers,
 * update the lookup keys, then re-insert into the right bucket. */
1109                                 ipgre_tunnel_unlink(ign, t);
1110                                 synchronize_net();
1111                                 t->parms.iph.saddr = p.iph.saddr;
1112                                 t->parms.iph.daddr = p.iph.daddr;
1113                                 t->parms.i_key = p.i_key;
1114                                 t->parms.o_key = p.o_key;
1115                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1116                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1117                                 ipgre_tunnel_link(ign, t);
1118                                 netdev_state_change(dev);
1119                         }
1120                 }
1121
1122                 if (t) {
1123                         err = 0;
1124                         if (cmd == SIOCCHGTUNNEL) {
1125                                 t->parms.iph.ttl = p.iph.ttl;
1126                                 t->parms.iph.tos = p.iph.tos;
1127                                 t->parms.iph.frag_off = p.iph.frag_off;
1128                                 if (t->parms.link != p.link) {
1129                                         t->parms.link = p.link;
1130                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1131                                         netdev_state_change(dev);
1132                                 }
1133                         }
1134                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1135                                 err = -EFAULT;
1136                 } else
1137                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1138                 break;
1139
1140         case SIOCDELTUNNEL:
1141                 err = -EPERM;
1142                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1143                         goto done;
1144
1145                 if (dev == ign->fb_tunnel_dev) {
1146                         err = -EFAULT;
1147                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1148                                 goto done;
1149                         err = -ENOENT;
1150                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1151                                 goto done;
/* The fallback device itself must never be deleted this way. */
1152                         err = -EPERM;
1153                         if (t == netdev_priv(ign->fb_tunnel_dev))
1154                                 goto done;
1155                         dev = t->dev;
1156                 }
1157                 unregister_netdevice(dev);
1158                 err = 0;
1159                 break;
1160
1161         default:
1162                 err = -EINVAL;
1163         }
1164
1165 done:
1166         return err;
1167 }
1168
1169 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1170 {
1171         struct ip_tunnel *tunnel = netdev_priv(dev);
1172         if (new_mtu < 68 ||
1173             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1174                 return -EINVAL;
1175         dev->mtu = new_mtu;
1176         return 0;
1177 }
1178
1179 /* Nice toy. Unfortunately, useless in real life :-)
1180    It allows to construct virtual multiprotocol broadcast "LAN"
1181    over the Internet, provided multicast routing is tuned.
1182
1183
1184    I have no idea whether this bicycle was invented before me,
1185    so I had to set ARPHRD_IPGRE to a random value.
1186    I have an impression, that Cisco could make something similar,
1187    but this feature is apparently missing in IOS<=11.2(8).
1188
1189    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1190    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1191
1192    ping -t 255 224.66.66.66
1193
1194    If nobody answers, mbone does not work.
1195
1196    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1197    ip addr add 10.66.66.<somewhat>/24 dev Universe
1198    ifconfig Universe up
1199    ifconfig Universe add fe80::<Your_real_addr>/10
1200    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1201    ftp 10.66.66.66
1202    ...
1203    ftp fec0:6666:6666::193.233.7.65
1204    ...
1205
1206  */
1207
/*
 * header_ops->create handler: prepend the outer IPv4 + GRE header template
 * to @skb for broadcast/NBMA tunnels.  The outer addresses default to the
 * tunnel's configured endpoints and may be overridden by @saddr/@daddr.
 * Returns the header length when the destination is already known, or its
 * negation to signal that the destination must still be resolved (e.g. by
 * ARP on the virtual link).
 */
1208 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1209                         unsigned short type,
1210                         const void *daddr, const void *saddr, unsigned int len)
1211 {
1212         struct ip_tunnel *t = netdev_priv(dev);
1213         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
/* The 4 GRE bytes (flags + inner protocol) sit right after the IP header. */
1214         __be16 *p = (__be16 *)(iph+1);
1215
1216         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1217         p[0]            = t->parms.o_flags;
1218         p[1]            = htons(type);
1219
1220         /*
1221          *      Set the source hardware address.
1222          */
1223
1224         if (saddr)
1225                 memcpy(&iph->saddr, saddr, 4);
1226         if (daddr)
1227                 memcpy(&iph->daddr, daddr, 4);
1228         if (iph->daddr)
1229                 return t->hlen;
1230
1231         return -t->hlen;
1232 }
1233
1234 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1235 {
1236         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1237         memcpy(haddr, &iph->saddr, 4);
1238         return 4;
1239 }
1240
/* Link-layer header ops for broadcast/NBMA GRE devices. */
1241 static const struct header_ops ipgre_header_ops = {
1242         .create = ipgre_header,
1243         .parse  = ipgre_header_parse,
1244 };
1245
1246 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open handler for broadcast GRE tunnels: when the remote endpoint is
 * an IPv4 multicast group, route towards it to find the physical device
 * and join the group on that device.  The joined ifindex is remembered in
 * t->mlink so ipgre_close() can leave the group later.
 */
1247 static int ipgre_open(struct net_device *dev)
1248 {
1249         struct ip_tunnel *t = netdev_priv(dev);
1250
1251         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1252                 struct flowi4 fl4;
1253                 struct rtable *rt;
1254
1255                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1256                                          t->parms.iph.daddr,
1257                                          t->parms.iph.saddr,
1258                                          t->parms.o_key,
1259                                          RT_TOS(t->parms.iph.tos),
1260                                          t->parms.link);
1261                 if (IS_ERR(rt))
1262                         return -EADDRNOTAVAIL;
/* Note: 'dev' is reused here to refer to the underlying physical device. */
1263                 dev = rt->dst.dev;
1264                 ip_rt_put(rt);
1265                 if (__in_dev_get_rtnl(dev) == NULL)
1266                         return -EADDRNOTAVAIL;
1267                 t->mlink = dev->ifindex;
1268                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1269         }
1270         return 0;
1271 }
1272
/*
 * ndo_stop handler: leave the multicast group joined in ipgre_open(),
 * looked up via the ifindex recorded in t->mlink.
 */
1273 static int ipgre_close(struct net_device *dev)
1274 {
1275         struct ip_tunnel *t = netdev_priv(dev);
1276
1277         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1278                 struct in_device *in_dev;
1279                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
/* The underlying device may have gone away meanwhile; then nothing to do. */
1280                 if (in_dev)
1281                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1282         }
1283         return 0;
1284 }
1285
1286 #endif
1287
/* Device operations for plain (ARPHRD_IPGRE) tunnel devices. */
1288 static const struct net_device_ops ipgre_netdev_ops = {
1289         .ndo_init               = ipgre_tunnel_init,
1290         .ndo_uninit             = ipgre_tunnel_uninit,
1291 #ifdef CONFIG_NET_IPGRE_BROADCAST
1292         .ndo_open               = ipgre_open,
1293         .ndo_stop               = ipgre_close,
1294 #endif
1295         .ndo_start_xmit         = ipgre_tunnel_xmit,
1296         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1297         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1298         .ndo_get_stats64        = ipgre_get_stats64,
1299 };
1300
/*
 * Device destructor: tear down the GRO cells and per-cpu stats before
 * releasing the netdev itself.  Order matters — everything hanging off
 * the private area must go before free_netdev().
 */
1301 static void ipgre_dev_free(struct net_device *dev)
1302 {
1303         struct ip_tunnel *tunnel = netdev_priv(dev);
1304
1305         gro_cells_destroy(&tunnel->gro_cells);
1306         free_percpu(dev->tstats);
1307         free_netdev(dev);
1308 }
1309
/* Offload features GRE devices can advertise (and offer via hw_features). */
1310 #define GRE_FEATURES (NETIF_F_SG |              \
1311                       NETIF_F_FRAGLIST |        \
1312                       NETIF_F_HIGHDMA |         \
1313                       NETIF_F_HW_CSUM)
1314
/*
 * Netdev setup callback for plain GRE devices: installs the ops and
 * destructor and picks conservative defaults (NOARP, 4-byte addresses,
 * MTU assuming the minimal IPv4+GRE encapsulation overhead).
 */
1315 static void ipgre_tunnel_setup(struct net_device *dev)
1316 {
1317         dev->netdev_ops         = &ipgre_netdev_ops;
1318         dev->destructor         = ipgre_dev_free;
1319
1320         dev->type               = ARPHRD_IPGRE;
1321         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1322         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1323         dev->flags              = IFF_NOARP;
1324         dev->iflink             = 0;
1325         dev->addr_len           = 4;
1326         dev->features           |= NETIF_F_NETNS_LOCAL;
/* Keep the cached dst on xmit; the tunnel manages routes itself. */
1327         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1328
1329         dev->features           |= GRE_FEATURES;
1330         dev->hw_features        |= GRE_FEATURES;
1331 }
1332
/*
 * ndo_init for plain GRE devices: copy the tunnel endpoints into the
 * device's hardware/broadcast addresses, select header_ops for NBMA or
 * multicast operation, and allocate per-cpu stats plus GRO cells.
 * Returns 0 or a negative errno; on failure everything allocated here
 * is released again.
 */
1333 static int ipgre_tunnel_init(struct net_device *dev)
1334 {
1335         struct ip_tunnel *tunnel;
1336         struct iphdr *iph;
1337         int err;
1338
1339         tunnel = netdev_priv(dev);
1340         iph = &tunnel->parms.iph;
1341
1342         tunnel->dev = dev;
1343         strcpy(tunnel->parms.name, dev->name);
1344
1345         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1346         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1347
1348         if (iph->daddr) {
1349 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* A multicast remote needs a local address to source the joins from. */
1350                 if (ipv4_is_multicast(iph->daddr)) {
1351                         if (!iph->saddr)
1352                                 return -EINVAL;
1353                         dev->flags = IFF_BROADCAST;
1354                         dev->header_ops = &ipgre_header_ops;
1355                 }
1356 #endif
1357         } else
1358                 dev->header_ops = &ipgre_header_ops;
1359
1360         dev->tstats = alloc_percpu(struct pcpu_tstats);
1361         if (!dev->tstats)
1362                 return -ENOMEM;
1363
1364         err = gro_cells_init(&tunnel->gro_cells, dev);
1365         if (err) {
/* Unwind the stats allocation; the caller will not call ndo_uninit. */
1366                 free_percpu(dev->tstats);
1367                 return err;
1368         }
1369
1370         return 0;
1371 }
1372
/*
 * Initialize the per-namespace fallback device ("gre0") that catches
 * traffic not matching any configured tunnel.  Only the fixed parts of
 * the outer header template are set; the dev_hold() pins the device for
 * the lifetime of the namespace.
 */
1373 static void ipgre_fb_tunnel_init(struct net_device *dev)
1374 {
1375         struct ip_tunnel *tunnel = netdev_priv(dev);
1376         struct iphdr *iph = &tunnel->parms.iph;
1377
1378         tunnel->dev = dev;
1379         strcpy(tunnel->parms.name, dev->name);
1380
1381         iph->version            = 4;
1382         iph->protocol           = IPPROTO_GRE;
1383         iph->ihl                = 5;
/* Minimal encapsulation: outer IP header + 4-byte basic GRE header. */
1384         tunnel->hlen            = sizeof(struct iphdr) + 4;
1385
1386         dev_hold(dev);
1387 }
1388
1389
/* GRE protocol hooks: receive path and ICMP error propagation. */
1390 static const struct gre_protocol ipgre_protocol = {
1391         .handler     = ipgre_rcv,
1392         .err_handler = ipgre_err,
1393 };
1394
/*
 * Queue every tunnel of a namespace for unregistration on @head.
 * Walks all four priority hash tables under RTNL (hence the
 * rtnl_dereference of the RCU-protected chain pointers).
 */
1395 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1396 {
1397         int prio;
1398
1399         for (prio = 0; prio < 4; prio++) {
1400                 int h;
1401                 for (h = 0; h < HASH_SIZE; h++) {
1402                         struct ip_tunnel *t;
1403
1404                         t = rtnl_dereference(ign->tunnels[prio][h]);
1405
1406                         while (t != NULL) {
1407                                 unregister_netdevice_queue(t->dev, head);
1408                                 t = rtnl_dereference(t->next);
1409                         }
1410                 }
1411         }
1412 }
1413
/*
 * Per-namespace init: create and register the fallback "gre0" device and
 * publish its tunnel as the wildcard (keyless, any-address) entry.
 * On failure, unwinds the device allocation and returns a negative errno.
 */
1414 static int __net_init ipgre_init_net(struct net *net)
1415 {
1416         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1417         int err;
1418
1419         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1420                                            ipgre_tunnel_setup);
1421         if (!ign->fb_tunnel_dev) {
1422                 err = -ENOMEM;
1423                 goto err_alloc_dev;
1424         }
1425         dev_net_set(ign->fb_tunnel_dev, net);
1426
1427         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1428         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1429
1430         if ((err = register_netdev(ign->fb_tunnel_dev)))
1431                 goto err_reg_dev;
1432
/* Make the fallback tunnel visible to RCU readers in the wildcard chain. */
1433         rcu_assign_pointer(ign->tunnels_wc[0],
1434                            netdev_priv(ign->fb_tunnel_dev));
1435         return 0;
1436
1437 err_reg_dev:
/* register_netdev failed, so the destructor must be invoked by hand. */
1438         ipgre_dev_free(ign->fb_tunnel_dev);
1439 err_alloc_dev:
1440         return err;
1441 }
1442
/*
 * Per-namespace teardown: collect all tunnels (including the fallback
 * device) and unregister them in one batch under RTNL.
 */
1443 static void __net_exit ipgre_exit_net(struct net *net)
1444 {
1445         struct ipgre_net *ign;
1446         LIST_HEAD(list);
1447
1448         ign = net_generic(net, ipgre_net_id);
1449         rtnl_lock();
1450         ipgre_destroy_tunnels(ign, &list);
1451         unregister_netdevice_many(&list);
1452         rtnl_unlock();
1453 }
1454
/* Per-network-namespace lifecycle hooks and private-state descriptor. */
1455 static struct pernet_operations ipgre_net_ops = {
1456         .init = ipgre_init_net,
1457         .exit = ipgre_exit_net,
1458         .id   = &ipgre_net_id,
1459         .size = sizeof(struct ipgre_net),
1460 };
1461
1462 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1463 {
1464         __be16 flags;
1465
1466         if (!data)
1467                 return 0;
1468
1469         flags = 0;
1470         if (data[IFLA_GRE_IFLAGS])
1471                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1472         if (data[IFLA_GRE_OFLAGS])
1473                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1474         if (flags & (GRE_VERSION|GRE_ROUTING))
1475                 return -EINVAL;
1476
1477         return 0;
1478 }
1479
1480 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1481 {
1482         __be32 daddr;
1483
1484         if (tb[IFLA_ADDRESS]) {
1485                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1486                         return -EINVAL;
1487                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1488                         return -EADDRNOTAVAIL;
1489         }
1490
1491         if (!data)
1492                 goto out;
1493
1494         if (data[IFLA_GRE_REMOTE]) {
1495                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1496                 if (!daddr)
1497                         return -EINVAL;
1498         }
1499
1500 out:
1501         return ipgre_tunnel_validate(tb, data);
1502 }
1503
/*
 * Translate netlink attributes into a struct ip_tunnel_parm.  Starts from
 * an all-zero template with protocol preset to GRE, then applies each
 * attribute present in @data.  With no @data the template alone is used.
 */
1504 static void ipgre_netlink_parms(struct nlattr *data[],
1505                                 struct ip_tunnel_parm *parms)
1506 {
1507         memset(parms, 0, sizeof(*parms));
1508
1509         parms->iph.protocol = IPPROTO_GRE;
1510
1511         if (!data)
1512                 return;
1513
1514         if (data[IFLA_GRE_LINK])
1515                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1516
1517         if (data[IFLA_GRE_IFLAGS])
1518                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1519
1520         if (data[IFLA_GRE_OFLAGS])
1521                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1522
1523         if (data[IFLA_GRE_IKEY])
1524                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1525
1526         if (data[IFLA_GRE_OKEY])
1527                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1528
1529         if (data[IFLA_GRE_LOCAL])
1530                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1531
1532         if (data[IFLA_GRE_REMOTE])
1533                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1534
1535         if (data[IFLA_GRE_TTL])
1536                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1537
1538         if (data[IFLA_GRE_TOS])
1539                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1540
/* Path MTU discovery defaults to ON; it is only disabled explicitly. */
1541         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1542                 parms->iph.frag_off = htons(IP_DF);
1543 }
1544
/*
 * ndo_init for GRE-tap devices: bind to the underlying device and
 * allocate per-cpu stats.  Unlike ipgre_tunnel_init() this sets no
 * header_ops (Ethernet framing is used) and no GRO cells.
 * Returns 0 or -ENOMEM.
 */
1545 static int ipgre_tap_init(struct net_device *dev)
1546 {
1547         struct ip_tunnel *tunnel;
1548
1549         tunnel = netdev_priv(dev);
1550
1551         tunnel->dev = dev;
1552         strcpy(tunnel->parms.name, dev->name);
1553
1554         ipgre_tunnel_bind_dev(dev);
1555
1556         dev->tstats = alloc_percpu(struct pcpu_tstats);
1557         if (!dev->tstats)
1558                 return -ENOMEM;
1559
1560         return 0;
1561 }
1562
/* Device operations for GRE-tap (Ethernet-over-GRE) devices. */
1563 static const struct net_device_ops ipgre_tap_netdev_ops = {
1564         .ndo_init               = ipgre_tap_init,
1565         .ndo_uninit             = ipgre_tunnel_uninit,
1566         .ndo_start_xmit         = ipgre_tunnel_xmit,
1567         .ndo_set_mac_address    = eth_mac_addr,
1568         .ndo_validate_addr      = eth_validate_addr,
1569         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1570         .ndo_get_stats64        = ipgre_get_stats64,
1571 };
1572
/*
 * Netdev setup callback for GRE-tap devices: start from a standard
 * Ethernet device and swap in the GRE-tap ops and destructor.
 */
1573 static void ipgre_tap_setup(struct net_device *dev)
1574 {
1575
1576         ether_setup(dev);
1577
1578         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1579         dev->destructor         = ipgre_dev_free;
1580
1581         dev->iflink             = 0;
1582         dev->features           |= NETIF_F_NETNS_LOCAL;
1583 }
1584
/*
 * rtnl_link_ops->newlink: create a GRE device from netlink attributes.
 * Rejects duplicates of an existing tunnel with the same parameters,
 * gives taps a random MAC when none is supplied, derives the MTU from
 * the underlying link unless IFLA_MTU was given, then registers the
 * device and inserts it into the tunnel hash table.
 */
1585 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1586                          struct nlattr *data[])
1587 {
1588         struct ip_tunnel *nt;
1589         struct net *net = dev_net(dev);
1590         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1591         int mtu;
1592         int err;
1593
1594         nt = netdev_priv(dev);
1595         ipgre_netlink_parms(data, &nt->parms);
1596
1597         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1598                 return -EEXIST;
1599
1600         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1601                 eth_hw_addr_random(dev);
1602
1603         mtu = ipgre_tunnel_bind_dev(dev);
1604         if (!tb[IFLA_MTU])
1605                 dev->mtu = mtu;
1606
1607         /* Can use a lockless transmit, unless we generate output sequences */
1608         if (!(nt->parms.o_flags & GRE_SEQ))
1609                 dev->features |= NETIF_F_LLTX;
1610
1611         err = register_netdevice(dev);
1612         if (err)
1613                 goto out;
1614
/* The hash-table entry holds a device reference until unlink. */
1615         dev_hold(dev);
1616         ipgre_tunnel_link(ign, nt);
1617
1618 out:
1619         return err;
1620 }
1621
/*
 * rtnl_link_ops->changelink: update an existing GRE device from netlink
 * attributes.  The fallback device cannot be reconfigured.  If the new
 * parameters match another device, fail with -EEXIST; otherwise re-key
 * this device (unlink, update lookup keys, relink) and apply the
 * remaining header and link settings.
 */
1622 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1623                             struct nlattr *data[])
1624 {
1625         struct ip_tunnel *t, *nt;
1626         struct net *net = dev_net(dev);
1627         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1628         struct ip_tunnel_parm p;
1629         int mtu;
1630
1631         if (dev == ign->fb_tunnel_dev)
1632                 return -EINVAL;
1633
1634         nt = netdev_priv(dev);
1635         ipgre_netlink_parms(data, &p);
1636
1637         t = ipgre_tunnel_locate(net, &p, 0);
1638
1639         if (t) {
1640                 if (t->dev != dev)
1641                         return -EEXIST;
1642         } else {
1643                 t = nt;
1644
/* Non-Ethernet devices must not change their broadcast/p2p nature. */
1645                 if (dev->type != ARPHRD_ETHER) {
1646                         unsigned int nflags = 0;
1647
1648                         if (ipv4_is_multicast(p.iph.daddr))
1649                                 nflags = IFF_BROADCAST;
1650                         else if (p.iph.daddr)
1651                                 nflags = IFF_POINTOPOINT;
1652
1653                         if ((dev->flags ^ nflags) &
1654                             (IFF_POINTOPOINT | IFF_BROADCAST))
1655                                 return -EINVAL;
1656                 }
1657
/* Re-key: drop from the hash table, update lookup keys, re-insert. */
1658                 ipgre_tunnel_unlink(ign, t);
1659                 t->parms.iph.saddr = p.iph.saddr;
1660                 t->parms.iph.daddr = p.iph.daddr;
1661                 t->parms.i_key = p.i_key;
1662                 if (dev->type != ARPHRD_ETHER) {
1663                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1664                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1665                 }
1666                 ipgre_tunnel_link(ign, t);
1667                 netdev_state_change(dev);
1668         }
1669
1670         t->parms.o_key = p.o_key;
1671         t->parms.iph.ttl = p.iph.ttl;
1672         t->parms.iph.tos = p.iph.tos;
1673         t->parms.iph.frag_off = p.iph.frag_off;
1674
1675         if (t->parms.link != p.link) {
1676                 t->parms.link = p.link;
1677                 mtu = ipgre_tunnel_bind_dev(dev);
1678                 if (!tb[IFLA_MTU])
1679                         dev->mtu = mtu;
1680                 netdev_state_change(dev);
1681         }
1682
1683         return 0;
1684 }
1685
1686 static size_t ipgre_get_size(const struct net_device *dev)
1687 {
1688         return
1689                 /* IFLA_GRE_LINK */
1690                 nla_total_size(4) +
1691                 /* IFLA_GRE_IFLAGS */
1692                 nla_total_size(2) +
1693                 /* IFLA_GRE_OFLAGS */
1694                 nla_total_size(2) +
1695                 /* IFLA_GRE_IKEY */
1696                 nla_total_size(4) +
1697                 /* IFLA_GRE_OKEY */
1698                 nla_total_size(4) +
1699                 /* IFLA_GRE_LOCAL */
1700                 nla_total_size(4) +
1701                 /* IFLA_GRE_REMOTE */
1702                 nla_total_size(4) +
1703                 /* IFLA_GRE_TTL */
1704                 nla_total_size(1) +
1705                 /* IFLA_GRE_TOS */
1706                 nla_total_size(1) +
1707                 /* IFLA_GRE_PMTUDISC */
1708                 nla_total_size(1) +
1709                 0;
1710 }
1711
1712 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1713 {
1714         struct ip_tunnel *t = netdev_priv(dev);
1715         struct ip_tunnel_parm *p = &t->parms;
1716
1717         if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1718             nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1719             nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1720             nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1721             nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1722             nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1723             nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1724             nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1725             nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1726             nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1727                        !!(p->iph.frag_off & htons(IP_DF))))
1728                 goto nla_put_failure;
1729         return 0;
1730
1731 nla_put_failure:
1732         return -EMSGSIZE;
1733 }
1734
/*
 * Netlink attribute validation policy for IFLA_GRE_* attributes,
 * shared by the "gre" and "gretap" rtnl_link_ops.  LOCAL/REMOTE are
 * validated by length (IPv4 address size) rather than integer type.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1747
/*
 * rtnetlink ops for plain layer-3 GRE tunnels ("ip link add ... type gre").
 * Shares newlink/changelink/get_size/fill_info with the gretap variant;
 * only the setup and validate callbacks differ.
 */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1760
/*
 * rtnetlink ops for Ethernet-over-GRE tunnels ("ip link add ... type
 * gretap").  Identical to ipgre_link_ops except for the Ethernet-style
 * setup and validate callbacks.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1773
1774 /*
1775  *      And now the modules code and kernel interface.
1776  */
1777
1778 static int __init ipgre_init(void)
1779 {
1780         int err;
1781
1782         pr_info("GRE over IPv4 tunneling driver\n");
1783
1784         err = register_pernet_device(&ipgre_net_ops);
1785         if (err < 0)
1786                 return err;
1787
1788         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1789         if (err < 0) {
1790                 pr_info("%s: can't add protocol\n", __func__);
1791                 goto add_proto_failed;
1792         }
1793
1794         err = rtnl_link_register(&ipgre_link_ops);
1795         if (err < 0)
1796                 goto rtnl_link_failed;
1797
1798         err = rtnl_link_register(&ipgre_tap_ops);
1799         if (err < 0)
1800                 goto tap_ops_failed;
1801
1802 out:
1803         return err;
1804
1805 tap_ops_failed:
1806         rtnl_link_unregister(&ipgre_link_ops);
1807 rtnl_link_failed:
1808         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1809 add_proto_failed:
1810         unregister_pernet_device(&ipgre_net_ops);
1811         goto out;
1812 }
1813
/*
 * Module exit: tear down everything ipgre_init() registered, in strict
 * reverse order (link ops first so no new tunnels can be created, then
 * the protocol handler, then the pernet device ops).
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	/* Can only fail if someone else unregistered it first — log, don't crash. */
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}
1822
/* Module entry/exit hooks plus aliases so "ip link add type gre|gretap"
 * and opening gre0 can trigger autoloading of this module. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");