/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller :	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
 *					year-long coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */


#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
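
/*
 * Note on the sequence choice in tcp_twsk_unique() above (numbers
 * illustrative): the new connection's ISN is placed at
 * tw_snd_nxt + 65535 + 2, i.e. beyond the old incarnation's send space
 * plus the largest unscaled window, so late duplicates from the old
 * connection cannot land inside the new sequence space; the extra +2
 * roughly accounts for the sequence numbers consumed by FIN and SYN.
 * E.g. if the old connection ended with tw_snd_nxt == 1000, the new
 * one starts at 1000 + 65535 + 2 = 66537 (bumped to 1 if it wraps to 0).
 */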

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table when entering the
		 * TIME-WAIT state, and initialize rx_opt.ts_recent from
		 * it when trying a new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether pmtu discovery is
	 * forbidden on this route. We just assume that no packet-too-big
	 * packets are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * for the case where this connection cannot recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
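
/*
 * Worked example for the path above (illustrative values): the socket
 * cached icsk_pmtu_cookie == 1500 and an ICMP_FRAG_NEEDED arrives with
 * mtu == 1400.  update_pmtu() lowers the route MTU, tcp_sync_mss()
 * rederives the MSS from the 1400-byte path MTU (roughly 1400 - 20
 * bytes IP - 20 bytes TCP = 1360 bytes before options), and
 * tcp_simple_retransmit() immediately resends the queued segments that
 * no longer fit, instead of waiting for the retransmit timer.
 */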

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot normally happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
	 * to be considered hard errors (well, FRAG_FAILED too, but it
	 * is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
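
/*
 * Example of the ICMP-driven backoff revert in tcp_v4_err() above
 * (illustrative numbers, see draft-zimmermann-tcp-lcd): with a base
 * RTO of 200 ms and icsk_backoff == 3 the effective timeout was
 * 200 ms << 3 = 1.6 s.  A NET/HOST_UNREACH for the first
 * unacknowledged segment drops the backoff to 2, i.e. 200 ms << 2 =
 * 800 ms; if more than 800 ms have already passed since the segment
 * was stamped, "remaining" is 0 and tcp_retransmit_timer() fires at
 * once, otherwise the timer is re-armed for the remainder.
 */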

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}
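
/*
 * In the CHECKSUM_PARTIAL branch above the device finishes the job:
 * only the folded pseudo-header sum is stored in th->check (note the
 * ~), and csum_start/csum_offset tell the NIC where to start summing
 * and where to write the result.  The else branch is the software
 * fallback and computes the complete checksum over header plus payload
 * via csum_partial().
 */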

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset.
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_md5_do_lookup(sk,
				     (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
				     AF_INET) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}
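
/*
 * Example of the ack_seq arithmetic above (illustrative numbers): for
 * an incoming segment with seq == 5000, a 20-byte header (doff == 5),
 * 100 bytes of payload and no SYN/FIN, skb->len is 120 and the RST
 * acknowledges 5000 + 0 + 0 + 120 - 20 = 5100, i.e. exactly one past
 * the last received byte, which makes the reset acceptable to the peer.
 */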

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside the socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			      struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return 1 if a syncookie should be sent
 */
int tcp_syn_flood_action(struct sock *sk,
			 const struct sk_buff *skb,
			 const char *proto)
{
	const char *msg = "Dropping request";
	int want_cookie = 0;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. "
			"Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
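
/*
 * Note: whether cookies are actually sent above depends on both
 * CONFIG_SYN_COOKIES and the net.ipv4.tcp_syncookies sysctl; with
 * cookies unavailable the listener simply sheds load by dropping the
 * request.  Either way the "Possible SYN flooding" message is printed
 * at most once per listen socket.
 */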

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
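
/*
 * tcp_md5_do_lookup() is the read side of the RCU scheme used for MD5
 * keys in this file: writers publish via rcu_assign_pointer() and
 * hlist_add_head_rcu() and free via kfree_rcu(), so a lockless caller
 * may only use the returned key inside the read-side section.  A
 * minimal caller sketch (illustrative):
 *
 *	rcu_read_lock();
 *	key = tcp_md5_do_lookup(sk, addr, AF_INET);
 *	if (key)
 *		... use key, only while inside the read section ...
 *	rcu_read_unlock();
 */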

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos, *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
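
/*
 * Userspace counterpart of tcp_v4_parse_md5_keys() (illustrative
 * sketch; fd and peer_addr are placeholders):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Setting tcpm_keylen to 0 deletes the key for that address.
 */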

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
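
/*
 * Per RFC 2385 the digest computed above covers, in order:
 *   1. the TCP pseudo-header (source and destination addresses,
 *      zero-padded protocol number, segment length),
 *   2. the TCP header with the checksum field zeroed and options
 *      excluded,
 *   3. the segment data (none for this header-only variant), and
 *   4. the connection key itself.
 * tcp_v4_md5_hash_hdr() serves the stack-built replies (RST/ACK)
 * elsewhere in this file, where no payload is present.
 */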

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	int want_cookie = 0;

	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* The accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

		want_cookie = 0;	/* not our kind of cookie */
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;
		struct flowi4 fl4;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table when entering
		 * the TIME-WAIT state, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit a live
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr &&
		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered at
			 * the moment of the synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->snt_synack = tcp_time_stamp;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
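
/*
 * Rough decision flow of tcp_v4_conn_request() above (sketch):
 *
 *	SYN arrives
 *	 |- broadcast/multicast destination       -> drop
 *	 |- SYN queue full, no TW-recycled ISN    -> syncookie or drop
 *	 |- accept queue full, >1 young requests  -> drop
 *	 |- tw_recycle PAWS check fails           -> drop (PAWSPASSIVEREJECTED)
 *	 '- else: allocate a request_sock, pick the ISN (cookie,
 *	    peer-validated, or secure hash), send the SYN-ACK and hash
 *	    the request with a TCP_TIMEOUT_INIT timer.
 */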

/*
 * The three-way handshake has completed - we got a valid ACK from the
 * peer - now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
		goto put_and_exit;

	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);
	if (tcp_rsk(req)->snt_synack)
		tcp_valid_rtt_meas(newsk,
		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
	newtp->total_retrans = req->retrans;

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	tcp_clear_xmit_timers(newsk);
	tcp_cleanup_congestion_control(newsk);
	bh_unlock_sock(newsk);
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1483
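/* Resolve a segment that arrived on a listening socket: first look for
 * a matching pending request_sock in the SYN queue, then for an
 * already established child in the ehash table, and finally, for a
 * non-SYN segment, try to validate it as a syncookie ACK.
 */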
1484 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcphdr *th = tcp_hdr(skb);
1487         const struct iphdr *iph = ip_hdr(skb);
1488         struct sock *nsk;
1489         struct request_sock **prev;
1490         /* Find possible connection requests. */
1491         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1492                                                        iph->saddr, iph->daddr);
1493         if (req)
1494                 return tcp_check_req(sk, skb, req, prev);
1495
1496         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1497                         th->source, iph->daddr, th->dest, inet_iif(skb));
1498
1499         if (nsk) {
1500                 if (nsk->sk_state != TCP_TIME_WAIT) {
1501                         bh_lock_sock(nsk);
1502                         return nsk;
1503                 }
1504                 inet_twsk_put(inet_twsk(nsk));
1505                 return NULL;
1506         }
1507
1508 #ifdef CONFIG_SYN_COOKIES
1509         if (!th->syn)
1510                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1511 #endif
1512         return sk;
1513 }
1514
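/* Validate or set up the TCP checksum.  With CHECKSUM_COMPLETE the
 * hardware sum only needs folding against the pseudo-header; otherwise
 * seed skb->csum with the pseudo-header sum and verify short packets
 * immediately, deferring longer ones until the data is consumed.
 */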
1515 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph = ip_hdr(skb);
1518
1519         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1520                 if (!tcp_v4_check(skb->len, iph->saddr,
1521                                   iph->daddr, skb->csum)) {
1522                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1523                         return 0;
1524                 }
1525         }
1526
1527         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1528                                        skb->len, IPPROTO_TCP, 0);
1529
1530         if (skb->len <= 76) {
1531                 return __skb_checksum_complete(skb);
1532         }
1533         return 0;
1534 }
1535
1536
1537 /* The socket must have its spinlock held when we get
1538  * here.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548 #ifdef CONFIG_TCP_MD5SIG
1549         /*
1550          * We really want to reject the packet as early as possible
1551          * if:
1552          *  o We're expecting an MD5-signed packet but there is no MD5 TCP option
1553          *  o There is an MD5 option and we're not expecting one
1554          */
1555         if (tcp_v4_inbound_md5_hash(sk, skb))
1556                 goto discard;
1557 #endif
1558
1559         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1560                 sock_rps_save_rxhash(sk, skb);
1561                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1562                         rsk = sk;
1563                         goto reset;
1564                 }
1565                 return 0;
1566         }
1567
1568         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1569                 goto csum_err;
1570
1571         if (sk->sk_state == TCP_LISTEN) {
1572                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1573                 if (!nsk)
1574                         goto discard;
1575
1576                 if (nsk != sk) {
1577                         sock_rps_save_rxhash(nsk, skb);
1578                         if (tcp_child_process(sk, nsk, skb)) {
1579                                 rsk = nsk;
1580                                 goto reset;
1581                         }
1582                         return 0;
1583                 }
1584         } else
1585                 sock_rps_save_rxhash(sk, skb);
1586
1587         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1588                 rsk = sk;
1589                 goto reset;
1590         }
1591         return 0;
1592
1593 reset:
1594         tcp_v4_send_reset(rsk, skb);
1595 discard:
1596         kfree_skb(skb);
1597         /* Be careful here. If this function gets more complicated and
1598          * gcc suffers from register pressure on the x86, sk (in %ebx)
1599          * might be destroyed here. This current version compiles correctly,
1600          * but you have been warned.
1601          */
1602         return 0;
1603
1604 csum_err:
1605         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1606         goto discard;
1607 }
1608 EXPORT_SYMBOL(tcp_v4_do_rcv);
1609
1610 /*
1611  *      From tcp_input.c
1612  */
1613
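/* tcp_v4_rcv() is the entry point for every TCP segment handed up by
 * IPv4: it validates the header and checksum, fills in the TCP control
 * block, looks the owning socket up in the hash tables, and then
 * either processes the segment directly, prequeues it, or places it
 * on the socket backlog.
 */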
1614 int tcp_v4_rcv(struct sk_buff *skb)
1615 {
1616         const struct iphdr *iph;
1617         const struct tcphdr *th;
1618         struct sock *sk;
1619         int ret;
1620         struct net *net = dev_net(skb->dev);
1621
1622         if (skb->pkt_type != PACKET_HOST)
1623                 goto discard_it;
1624
1625         /* Count it even if it's bad */
1626         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1627
1628         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1629                 goto discard_it;
1630
1631         th = tcp_hdr(skb);
1632
1633         if (th->doff < sizeof(struct tcphdr) / 4)
1634                 goto bad_packet;
1635         if (!pskb_may_pull(skb, th->doff * 4))
1636                 goto discard_it;
1637
1638         /* An explanation is required here.  Packet length and doff
1639          * are validated later by header prediction, provided the
1640          * case of th->doff == 0 has been eliminated above.
1641          * So we defer those checks until then. */
1642         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1643                 goto bad_packet;
1644
1645         th = tcp_hdr(skb);
1646         iph = ip_hdr(skb);
1647         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1648         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1649                                     skb->len - th->doff * 4);
1650         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1651         TCP_SKB_CB(skb)->when    = 0;
1652         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1653         TCP_SKB_CB(skb)->sacked  = 0;
1654
1655         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1656         if (!sk)
1657                 goto no_tcp_socket;
1658
1659 process:
1660         if (sk->sk_state == TCP_TIME_WAIT)
1661                 goto do_time_wait;
1662
1663         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1664                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1665                 goto discard_and_relse;
1666         }
1667
1668         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1669                 goto discard_and_relse;
1670         nf_reset(skb);
1671
1672         if (sk_filter(sk, skb))
1673                 goto discard_and_relse;
1674
1675         skb->dev = NULL;
1676
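        /* From here on the segment is charged to the socket.  If no
         * user task owns the socket, we process it right away in BH
         * context (possibly through the prequeue, or via a NET_DMA
         * copy channel when one is pinned); otherwise it goes on the
         * backlog for the owner to handle when it releases the socket.
         */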
1677         bh_lock_sock_nested(sk);
1678         ret = 0;
1679         if (!sock_owned_by_user(sk)) {
1680 #ifdef CONFIG_NET_DMA
1681                 struct tcp_sock *tp = tcp_sk(sk);
1682                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1683                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1684                 if (tp->ucopy.dma_chan)
1685                         ret = tcp_v4_do_rcv(sk, skb);
1686                 else
1687 #endif
1688                 {
1689                         if (!tcp_prequeue(sk, skb))
1690                                 ret = tcp_v4_do_rcv(sk, skb);
1691                 }
1692         } else if (unlikely(sk_add_backlog(sk, skb))) {
1693                 bh_unlock_sock(sk);
1694                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1695                 goto discard_and_relse;
1696         }
1697         bh_unlock_sock(sk);
1698
1699         sock_put(sk);
1700
1701         return ret;
1702
1703 no_tcp_socket:
1704         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1705                 goto discard_it;
1706
1707         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1708 bad_packet:
1709                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1710         } else {
1711                 tcp_v4_send_reset(NULL, skb);
1712         }
1713
1714 discard_it:
1715         /* Discard frame. */
1716         kfree_skb(skb);
1717         return 0;
1718
1719 discard_and_relse:
1720         sock_put(sk);
1721         goto discard_it;
1722
1723 do_time_wait:
1724         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1725                 inet_twsk_put(inet_twsk(sk));
1726                 goto discard_it;
1727         }
1728
1729         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1730                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1731                 inet_twsk_put(inet_twsk(sk));
1732                 goto discard_it;
1733         }
1734         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1735         case TCP_TW_SYN: {
1736                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1737                                                         &tcp_hashinfo,
1738                                                         iph->daddr, th->dest,
1739                                                         inet_iif(skb));
1740                 if (sk2) {
1741                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1742                         inet_twsk_put(inet_twsk(sk));
1743                         sk = sk2;
1744                         goto process;
1745                 }
1746                 /* Fall through to ACK */
1747         }
1748         case TCP_TW_ACK:
1749                 tcp_v4_timewait_ack(sk, skb);
1750                 break;
1751         case TCP_TW_RST:
1752                 goto no_tcp_socket;
1753         case TCP_TW_SUCCESS:;
1754         }
1755         goto discard_it;
1756 }
1757
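/* Return the inet_peer entry for this connection's destination.  The
 * peer cached on the route is reused when the cork flow still matches
 * the destination; otherwise a fresh reference is taken, and
 * *release_it tells the caller whether it must drop that reference.
 */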
1758 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1759 {
1760         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1761         struct inet_sock *inet = inet_sk(sk);
1762         struct inet_peer *peer;
1763
1764         if (!rt ||
1765             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1766                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1767                 *release_it = true;
1768         } else {
1769                 if (!rt->peer)
1770                         rt_bind_peer(rt, inet->inet_daddr, 1);
1771                 peer = rt->peer;
1772                 *release_it = false;
1773         }
1774
1775         return peer;
1776 }
1777 EXPORT_SYMBOL(tcp_v4_get_peer);
1778
1779 void *tcp_v4_tw_get_peer(struct sock *sk)
1780 {
1781         const struct inet_timewait_sock *tw = inet_twsk(sk);
1782
1783         return inet_getpeer_v4(tw->tw_daddr, 1);
1784 }
1785 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1786
1787 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1788         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1789         .twsk_unique    = tcp_twsk_unique,
1790         .twsk_destructor= tcp_twsk_destructor,
1791         .twsk_getpeer   = tcp_v4_tw_get_peer,
1792 };
1793
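/* Address-family dependent hooks the connection-oriented socket core
 * invokes on behalf of a plain IPv4 TCP socket; installed by
 * tcp_v4_init_sock() below.
 */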
1794 const struct inet_connection_sock_af_ops ipv4_specific = {
1795         .queue_xmit        = ip_queue_xmit,
1796         .send_check        = tcp_v4_send_check,
1797         .rebuild_header    = inet_sk_rebuild_header,
1798         .conn_request      = tcp_v4_conn_request,
1799         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1800         .get_peer          = tcp_v4_get_peer,
1801         .net_header_len    = sizeof(struct iphdr),
1802         .setsockopt        = ip_setsockopt,
1803         .getsockopt        = ip_getsockopt,
1804         .addr2sockaddr     = inet_csk_addr2sockaddr,
1805         .sockaddr_len      = sizeof(struct sockaddr_in),
1806         .bind_conflict     = inet_csk_bind_conflict,
1807 #ifdef CONFIG_COMPAT
1808         .compat_setsockopt = compat_ip_setsockopt,
1809         .compat_getsockopt = compat_ip_getsockopt,
1810 #endif
1811 };
1812 EXPORT_SYMBOL(ipv4_specific);
1813
1814 #ifdef CONFIG_TCP_MD5SIG
1815 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1816         .md5_lookup             = tcp_v4_md5_lookup,
1817         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1818         .md5_parse              = tcp_v4_parse_md5_keys,
1819 };
1820 #endif
1821
1822 /* NOTE: A lot of things are set to zero explicitly by the call to
1823  *       sk_alloc(), so they need not be done here.
1824  */
1825 static int tcp_v4_init_sock(struct sock *sk)
1826 {
1827         struct inet_connection_sock *icsk = inet_csk(sk);
1828         struct tcp_sock *tp = tcp_sk(sk);
1829
1830         skb_queue_head_init(&tp->out_of_order_queue);
1831         tcp_init_xmit_timers(sk);
1832         tcp_prequeue_init(tp);
1833
1834         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1835         tp->mdev = TCP_TIMEOUT_INIT;
1836
1837         /* So many TCP implementations out there (incorrectly) count the
1838          * initial SYN frame in their delayed-ACK and congestion control
1839          * algorithms that we must have the following bandaid to talk
1840          * efficiently to them.  -DaveM
1841          */
1842         tp->snd_cwnd = TCP_INIT_CWND;
1843
1844         /* See draft-stevens-tcpca-spec-01 for discussion of the
1845          * initialization of these values.
1846          */
1847         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1848         tp->snd_cwnd_clamp = ~0;
1849         tp->mss_cache = TCP_MSS_DEFAULT;
1850
1851         tp->reordering = sysctl_tcp_reordering;
1852         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1853
1854         sk->sk_state = TCP_CLOSE;
1855
1856         sk->sk_write_space = sk_stream_write_space;
1857         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1858
1859         icsk->icsk_af_ops = &ipv4_specific;
1860         icsk->icsk_sync_mss = tcp_sync_mss;
1861 #ifdef CONFIG_TCP_MD5SIG
1862         tp->af_specific = &tcp_sock_ipv4_specific;
1863 #endif
1864
1865         /* TCP Cookie Transactions */
1866         if (sysctl_tcp_cookie_size > 0) {
1867                 /* Default, cookies without s_data_payload. */
1868                 tp->cookie_values =
1869                         kzalloc(sizeof(*tp->cookie_values),
1870                                 sk->sk_allocation);
1871                 if (tp->cookie_values != NULL)
1872                         kref_init(&tp->cookie_values->kref);
1873         }
1874         /* Presumed zeroed, in order of appearance:
1875          *      cookie_in_always, cookie_out_never,
1876          *      s_data_constant, s_data_in, s_data_out
1877          */
1878         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1879         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1880
1881         local_bh_disable();
1882         sock_update_memcg(sk);
1883         sk_sockets_allocated_inc(sk);
1884         local_bh_enable();
1885
1886         return 0;
1887 }
1888
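/* Release everything still attached to the socket at destroy time:
 * timers, congestion state, queued skbs, the RCU-protected MD5 key
 * list, the bind bucket, the cached sendmsg page, and the TCP cookie
 * values, before dropping the memcg accounting.
 */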
1889 void tcp_v4_destroy_sock(struct sock *sk)
1890 {
1891         struct tcp_sock *tp = tcp_sk(sk);
1892
1893         tcp_clear_xmit_timers(sk);
1894
1895         tcp_cleanup_congestion_control(sk);
1896
1897         /* Clean up the write buffer. */
1898         tcp_write_queue_purge(sk);
1899
1900         /* Cleans up our, hopefully empty, out_of_order_queue. */
1901         __skb_queue_purge(&tp->out_of_order_queue);
1902
1903 #ifdef CONFIG_TCP_MD5SIG
1904         /* Clean up the MD5 key list, if any */
1905         if (tp->md5sig_info) {
1906                 tcp_clear_md5_list(sk);
1907                 kfree_rcu(tp->md5sig_info, rcu);
1908                 tp->md5sig_info = NULL;
1909         }
1910 #endif
1911
1912 #ifdef CONFIG_NET_DMA
1913         /* Cleans up our sk_async_wait_queue */
1914         __skb_queue_purge(&sk->sk_async_wait_queue);
1915 #endif
1916
1917         /* Clean up the prequeue; it really should be empty by now. */
1918         __skb_queue_purge(&tp->ucopy.prequeue);
1919
1920         /* Clean up a referenced TCP bind bucket. */
1921         if (inet_csk(sk)->icsk_bind_hash)
1922                 inet_put_port(sk);
1923
1924         /*
1925          * If sendmsg cached page exists, toss it.
1926          */
1927         if (sk->sk_sndmsg_page) {
1928                 __free_page(sk->sk_sndmsg_page);
1929                 sk->sk_sndmsg_page = NULL;
1930         }
1931
1932         /* TCP Cookie Transactions */
1933         if (tp->cookie_values != NULL) {
1934                 kref_put(&tp->cookie_values->kref,
1935                          tcp_cookie_values_release);
1936                 tp->cookie_values = NULL;
1937         }
1938
1939         sk_sockets_allocated_dec(sk);
1940         sock_release_memcg(sk);
1941 }
1942 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1943
1944 #ifdef CONFIG_PROC_FS
1945 /* Proc filesystem TCP sock list dumping. */
1946
1947 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1948 {
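        /* list_entry() is just container_of(), so it is equally valid
         * for an hlist_nulls node such as tw_node. */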
1949         return hlist_nulls_empty(head) ? NULL :
1950                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1951 }
1952
1953 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1954 {
1955         return !is_a_nulls(tw->tw_node.next) ?
1956                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1957 }
1958
1959 /*
1960  * Get the next listener socket following cur.  If cur is NULL, get the first
1961  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1962  * very first socket in the hash table is returned.
1963  */
1964 static void *listening_get_next(struct seq_file *seq, void *cur)
1965 {
1966         struct inet_connection_sock *icsk;
1967         struct hlist_nulls_node *node;
1968         struct sock *sk = cur;
1969         struct inet_listen_hashbucket *ilb;
1970         struct tcp_iter_state *st = seq->private;
1971         struct net *net = seq_file_net(seq);
1972
1973         if (!sk) {
1974                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1975                 spin_lock_bh(&ilb->lock);
1976                 sk = sk_nulls_head(&ilb->head);
1977                 st->offset = 0;
1978                 goto get_sk;
1979         }
1980         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1981         ++st->num;
1982         ++st->offset;
1983
1984         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1985                 struct request_sock *req = cur;
1986
1987                 icsk = inet_csk(st->syn_wait_sk);
1988                 req = req->dl_next;
1989                 while (1) {
1990                         while (req) {
1991                                 if (req->rsk_ops->family == st->family) {
1992                                         cur = req;
1993                                         goto out;
1994                                 }
1995                                 req = req->dl_next;
1996                         }
1997                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1998                                 break;
1999 get_req:
2000                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2001                 }
2002                 sk        = sk_nulls_next(st->syn_wait_sk);
2003                 st->state = TCP_SEQ_STATE_LISTENING;
2004                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005         } else {
2006                 icsk = inet_csk(sk);
2007                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2009                         goto start_req;
2010                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011                 sk = sk_nulls_next(sk);
2012         }
2013 get_sk:
2014         sk_nulls_for_each_from(sk, node) {
2015                 if (!net_eq(sock_net(sk), net))
2016                         continue;
2017                 if (sk->sk_family == st->family) {
2018                         cur = sk;
2019                         goto out;
2020                 }
2021                 icsk = inet_csk(sk);
2022                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2023                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2024 start_req:
2025                         st->uid         = sock_i_uid(sk);
2026                         st->syn_wait_sk = sk;
2027                         st->state       = TCP_SEQ_STATE_OPENREQ;
2028                         st->sbucket     = 0;
2029                         goto get_req;
2030                 }
2031                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2032         }
2033         spin_unlock_bh(&ilb->lock);
2034         st->offset = 0;
2035         if (++st->bucket < INET_LHTABLE_SIZE) {
2036                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2037                 spin_lock_bh(&ilb->lock);
2038                 sk = sk_nulls_head(&ilb->head);
2039                 goto get_sk;
2040         }
2041         cur = NULL;
2042 out:
2043         return cur;
2044 }
2045
2046 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2047 {
2048         struct tcp_iter_state *st = seq->private;
2049         void *rc;
2050
2051         st->bucket = 0;
2052         st->offset = 0;
2053         rc = listening_get_next(seq, NULL);
2054
2055         while (rc && *pos) {
2056                 rc = listening_get_next(seq, rc);
2057                 --*pos;
2058         }
2059         return rc;
2060 }
2061
2062 static inline int empty_bucket(struct tcp_iter_state *st)
2063 {
2064         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2065                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2066 }
2067
2068 /*
2069  * Get first established socket starting from bucket given in st->bucket.
2070  * If st->bucket is zero, the very first socket in the hash is returned.
2071  */
2072 static void *established_get_first(struct seq_file *seq)
2073 {
2074         struct tcp_iter_state *st = seq->private;
2075         struct net *net = seq_file_net(seq);
2076         void *rc = NULL;
2077
2078         st->offset = 0;
2079         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2080                 struct sock *sk;
2081                 struct hlist_nulls_node *node;
2082                 struct inet_timewait_sock *tw;
2083                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2084
2085                 /* Lockless fast path for the common case of empty buckets */
2086                 if (empty_bucket(st))
2087                         continue;
2088
2089                 spin_lock_bh(lock);
2090                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2091                         if (sk->sk_family != st->family ||
2092                             !net_eq(sock_net(sk), net)) {
2093                                 continue;
2094                         }
2095                         rc = sk;
2096                         goto out;
2097                 }
2098                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2099                 inet_twsk_for_each(tw, node,
2100                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2101                         if (tw->tw_family != st->family ||
2102                             !net_eq(twsk_net(tw), net)) {
2103                                 continue;
2104                         }
2105                         rc = tw;
2106                         goto out;
2107                 }
2108                 spin_unlock_bh(lock);
2109                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2110         }
2111 out:
2112         return rc;
2113 }
2114
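/* Advance the iterator within the established hash: walk the regular
 * chain first, then the twchain of TIME_WAIT sockets, and move on to
 * the next non-empty bucket (taking its lock) when both are exhausted.
 */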
2115 static void *established_get_next(struct seq_file *seq, void *cur)
2116 {
2117         struct sock *sk = cur;
2118         struct inet_timewait_sock *tw;
2119         struct hlist_nulls_node *node;
2120         struct tcp_iter_state *st = seq->private;
2121         struct net *net = seq_file_net(seq);
2122
2123         ++st->num;
2124         ++st->offset;
2125
2126         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2127                 tw = cur;
2128                 tw = tw_next(tw);
2129 get_tw:
2130                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2131                         tw = tw_next(tw);
2132                 }
2133                 if (tw) {
2134                         cur = tw;
2135                         goto out;
2136                 }
2137                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2138                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2139
2140                 /* Look for the next non-empty bucket */
2141                 st->offset = 0;
2142                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2143                                 empty_bucket(st))
2144                         ;
2145                 if (st->bucket > tcp_hashinfo.ehash_mask)
2146                         return NULL;
2147
2148                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2149                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2150         } else
2151                 sk = sk_nulls_next(sk);
2152
2153         sk_nulls_for_each_from(sk, node) {
2154                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2155                         goto found;
2156         }
2157
2158         st->state = TCP_SEQ_STATE_TIME_WAIT;
2159         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2160         goto get_tw;
2161 found:
2162         cur = sk;
2163 out:
2164         return cur;
2165 }
2166
2167 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2168 {
2169         struct tcp_iter_state *st = seq->private;
2170         void *rc;
2171
2172         st->bucket = 0;
2173         rc = established_get_first(seq);
2174
2175         while (rc && pos) {
2176                 rc = established_get_next(seq, rc);
2177                 --pos;
2178         }
2179         return rc;
2180 }
2181
2182 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2183 {
2184         void *rc;
2185         struct tcp_iter_state *st = seq->private;
2186
2187         st->state = TCP_SEQ_STATE_LISTENING;
2188         rc        = listening_get_idx(seq, &pos);
2189
2190         if (!rc) {
2191                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2192                 rc        = established_get_idx(seq, pos);
2193         }
2194
2195         return rc;
2196 }
2197
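/* Try to resume iteration at the bucket/offset recorded by the
 * previous read, so that large /proc/net/tcp dumps do not rescan the
 * whole hash table from the start on every read() call.
 */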
2198 static void *tcp_seek_last_pos(struct seq_file *seq)
2199 {
2200         struct tcp_iter_state *st = seq->private;
2201         int offset = st->offset;
2202         int orig_num = st->num;
2203         void *rc = NULL;
2204
2205         switch (st->state) {
2206         case TCP_SEQ_STATE_OPENREQ:
2207         case TCP_SEQ_STATE_LISTENING:
2208                 if (st->bucket >= INET_LHTABLE_SIZE)
2209                         break;
2210                 st->state = TCP_SEQ_STATE_LISTENING;
2211                 rc = listening_get_next(seq, NULL);
2212                 while (offset-- && rc)
2213                         rc = listening_get_next(seq, rc);
2214                 if (rc)
2215                         break;
2216                 st->bucket = 0;
2217                 /* Fallthrough */
2218         case TCP_SEQ_STATE_ESTABLISHED:
2219         case TCP_SEQ_STATE_TIME_WAIT:
2220                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2221                 if (st->bucket > tcp_hashinfo.ehash_mask)
2222                         break;
2223                 rc = established_get_first(seq);
2224                 while (offset-- && rc)
2225                         rc = established_get_next(seq, rc);
2226         }
2227
2228         st->num = orig_num;
2229
2230         return rc;
2231 }
2232
2233 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2234 {
2235         struct tcp_iter_state *st = seq->private;
2236         void *rc;
2237
2238         if (*pos && *pos == st->last_pos) {
2239                 rc = tcp_seek_last_pos(seq);
2240                 if (rc)
2241                         goto out;
2242         }
2243
2244         st->state = TCP_SEQ_STATE_LISTENING;
2245         st->num = 0;
2246         st->bucket = 0;
2247         st->offset = 0;
2248         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2249
2250 out:
2251         st->last_pos = *pos;
2252         return rc;
2253 }
2254
2255 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256 {
2257         struct tcp_iter_state *st = seq->private;
2258         void *rc = NULL;
2259
2260         if (v == SEQ_START_TOKEN) {
2261                 rc = tcp_get_idx(seq, 0);
2262                 goto out;
2263         }
2264
2265         switch (st->state) {
2266         case TCP_SEQ_STATE_OPENREQ:
2267         case TCP_SEQ_STATE_LISTENING:
2268                 rc = listening_get_next(seq, v);
2269                 if (!rc) {
2270                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2271                         st->bucket = 0;
2272                         st->offset = 0;
2273                         rc        = established_get_first(seq);
2274                 }
2275                 break;
2276         case TCP_SEQ_STATE_ESTABLISHED:
2277         case TCP_SEQ_STATE_TIME_WAIT:
2278                 rc = established_get_next(seq, v);
2279                 break;
2280         }
2281 out:
2282         ++*pos;
2283         st->last_pos = *pos;
2284         return rc;
2285 }
2286
2287 static void tcp_seq_stop(struct seq_file *seq, void *v)
2288 {
2289         struct tcp_iter_state *st = seq->private;
2290
2291         switch (st->state) {
2292         case TCP_SEQ_STATE_OPENREQ:
2293                 if (v) {
2294                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2295                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2296                 }
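                /* Fallthrough */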
2297         case TCP_SEQ_STATE_LISTENING:
2298                 if (v != SEQ_START_TOKEN)
2299                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2300                 break;
2301         case TCP_SEQ_STATE_TIME_WAIT:
2302         case TCP_SEQ_STATE_ESTABLISHED:
2303                 if (v)
2304                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2305                 break;
2306         }
2307 }
2308
2309 int tcp_seq_open(struct inode *inode, struct file *file)
2310 {
2311         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2312         struct tcp_iter_state *s;
2313         int err;
2314
2315         err = seq_open_net(inode, file, &afinfo->seq_ops,
2316                           sizeof(struct tcp_iter_state));
2317         if (err < 0)
2318                 return err;
2319
2320         s = ((struct seq_file *)file->private_data)->private;
2321         s->family               = afinfo->family;
2322         s->last_pos             = 0;
2323         return 0;
2324 }
2325 EXPORT_SYMBOL(tcp_seq_open);
2326
2327 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2328 {
2329         int rc = 0;
2330         struct proc_dir_entry *p;
2331
2332         afinfo->seq_ops.start           = tcp_seq_start;
2333         afinfo->seq_ops.next            = tcp_seq_next;
2334         afinfo->seq_ops.stop            = tcp_seq_stop;
2335
2336         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2337                              afinfo->seq_fops, afinfo);
2338         if (!p)
2339                 rc = -ENOMEM;
2340         return rc;
2341 }
2342 EXPORT_SYMBOL(tcp_proc_register);
2343
2344 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2345 {
2346         proc_net_remove(net, afinfo->name);
2347 }
2348 EXPORT_SYMBOL(tcp_proc_unregister);
2349
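/* The three helpers below each format one line of /proc/net/tcp: an
 * embryonic open request, a full socket, and a TIME_WAIT socket
 * respectively, using the fixed-width layout announced by the header
 * line in tcp4_seq_show().
 */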
2350 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2351                          struct seq_file *f, int i, int uid, int *len)
2352 {
2353         const struct inet_request_sock *ireq = inet_rsk(req);
2354         int ttd = req->expires - jiffies;
2355
2356         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2357                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2358                 i,
2359                 ireq->loc_addr,
2360                 ntohs(inet_sk(sk)->inet_sport),
2361                 ireq->rmt_addr,
2362                 ntohs(ireq->rmt_port),
2363                 TCP_SYN_RECV,
2364                 0, 0, /* could print option size, but that is af dependent. */
2365                 1,    /* timers active (only the expire timer) */
2366                 jiffies_to_clock_t(ttd),
2367                 req->retrans,
2368                 uid,
2369                 0,  /* non standard timer */
2370                 0, /* open_requests have no inode */
2371                 atomic_read(&sk->sk_refcnt),
2372                 req,
2373                 len);
2374 }
2375
2376 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2377 {
2378         int timer_active;
2379         unsigned long timer_expires;
2380         const struct tcp_sock *tp = tcp_sk(sk);
2381         const struct inet_connection_sock *icsk = inet_csk(sk);
2382         const struct inet_sock *inet = inet_sk(sk);
2383         __be32 dest = inet->inet_daddr;
2384         __be32 src = inet->inet_rcv_saddr;
2385         __u16 destp = ntohs(inet->inet_dport);
2386         __u16 srcp = ntohs(inet->inet_sport);
2387         int rx_queue;
2388
2389         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2390                 timer_active    = 1;
2391                 timer_expires   = icsk->icsk_timeout;
2392         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2393                 timer_active    = 4;
2394                 timer_expires   = icsk->icsk_timeout;
2395         } else if (timer_pending(&sk->sk_timer)) {
2396                 timer_active    = 2;
2397                 timer_expires   = sk->sk_timer.expires;
2398         } else {
2399                 timer_active    = 0;
2400                 timer_expires = jiffies;
2401         }
2402
2403         if (sk->sk_state == TCP_LISTEN)
2404                 rx_queue = sk->sk_ack_backlog;
2405         else
2406                 /*
2407                  * Because we don't lock the socket, we might find a transient negative value.
2408                  */
2409                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2410
2411         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2412                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2413                 i, src, srcp, dest, destp, sk->sk_state,
2414                 tp->write_seq - tp->snd_una,
2415                 rx_queue,
2416                 timer_active,
2417                 jiffies_to_clock_t(timer_expires - jiffies),
2418                 icsk->icsk_retransmits,
2419                 sock_i_uid(sk),
2420                 icsk->icsk_probes_out,
2421                 sock_i_ino(sk),
2422                 atomic_read(&sk->sk_refcnt), sk,
2423                 jiffies_to_clock_t(icsk->icsk_rto),
2424                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2425                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2426                 tp->snd_cwnd,
2427                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2428                 len);
2429 }
2430
2431 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2432                                struct seq_file *f, int i, int *len)
2433 {
2434         __be32 dest, src;
2435         __u16 destp, srcp;
2436         int ttd = tw->tw_ttd - jiffies;
2437
2438         if (ttd < 0)
2439                 ttd = 0;
2440
2441         dest  = tw->tw_daddr;
2442         src   = tw->tw_rcv_saddr;
2443         destp = ntohs(tw->tw_dport);
2444         srcp  = ntohs(tw->tw_sport);
2445
2446         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2447                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2448                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2449                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2450                 atomic_read(&tw->tw_refcnt), tw, len);
2451 }
2452
2453 #define TMPSZ 150
2454
2455 static int tcp4_seq_show(struct seq_file *seq, void *v)
2456 {
2457         struct tcp_iter_state *st;
2458         int len;
2459
2460         if (v == SEQ_START_TOKEN) {
2461                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2462                            "  sl  local_address rem_address   st tx_queue "
2463                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2464                            "inode");
2465                 goto out;
2466         }
2467         st = seq->private;
2468
2469         switch (st->state) {
2470         case TCP_SEQ_STATE_LISTENING:
2471         case TCP_SEQ_STATE_ESTABLISHED:
2472                 get_tcp4_sock(v, seq, st->num, &len);
2473                 break;
2474         case TCP_SEQ_STATE_OPENREQ:
2475                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2476                 break;
2477         case TCP_SEQ_STATE_TIME_WAIT:
2478                 get_timewait4_sock(v, seq, st->num, &len);
2479                 break;
2480         }
2481         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2482 out:
2483         return 0;
2484 }
2485
2486 static const struct file_operations tcp_afinfo_seq_fops = {
2487         .owner   = THIS_MODULE,
2488         .open    = tcp_seq_open,
2489         .read    = seq_read,
2490         .llseek  = seq_lseek,
2491         .release = seq_release_net
2492 };
2493
2494 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2495         .name           = "tcp",
2496         .family         = AF_INET,
2497         .seq_fops       = &tcp_afinfo_seq_fops,
2498         .seq_ops        = {
2499                 .show           = tcp4_seq_show,
2500         },
2501 };
2502
2503 static int __net_init tcp4_proc_init_net(struct net *net)
2504 {
2505         return tcp_proc_register(net, &tcp4_seq_afinfo);
2506 }
2507
2508 static void __net_exit tcp4_proc_exit_net(struct net *net)
2509 {
2510         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2511 }
2512
2513 static struct pernet_operations tcp4_net_ops = {
2514         .init = tcp4_proc_init_net,
2515         .exit = tcp4_proc_exit_net,
2516 };
2517
2518 int __init tcp4_proc_init(void)
2519 {
2520         return register_pernet_subsys(&tcp4_net_ops);
2521 }
2522
2523 void tcp4_proc_exit(void)
2524 {
2525         unregister_pernet_subsys(&tcp4_net_ops);
2526 }
2527 #endif /* CONFIG_PROC_FS */
2528
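/* GRO receive hook: cheaply validate the checksum against the IPv4
 * pseudo-header when the hardware left a complete sum, flag the
 * segment for flushing when it cannot be validated, and hand it to
 * the protocol-independent tcp_gro_receive().
 */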
2529 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2530 {
2531         const struct iphdr *iph = skb_gro_network_header(skb);
2532
2533         switch (skb->ip_summed) {
2534         case CHECKSUM_COMPLETE:
2535                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2536                                   skb->csum)) {
2537                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2538                         break;
2539                 }
2540
2541                 /* fall through */
2542         case CHECKSUM_NONE:
2543                 NAPI_GRO_CB(skb)->flush = 1;
2544                 return NULL;
2545         }
2546
2547         return tcp_gro_receive(head, skb);
2548 }
2549
2550 int tcp4_gro_complete(struct sk_buff *skb)
2551 {
2552         const struct iphdr *iph = ip_hdr(skb);
2553         struct tcphdr *th = tcp_hdr(skb);
2554
2555         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2556                                   iph->saddr, iph->daddr, 0);
2557         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2558
2559         return tcp_gro_complete(skb);
2560 }
2561
2562 struct proto tcp_prot = {
2563         .name                   = "TCP",
2564         .owner                  = THIS_MODULE,
2565         .close                  = tcp_close,
2566         .connect                = tcp_v4_connect,
2567         .disconnect             = tcp_disconnect,
2568         .accept                 = inet_csk_accept,
2569         .ioctl                  = tcp_ioctl,
2570         .init                   = tcp_v4_init_sock,
2571         .destroy                = tcp_v4_destroy_sock,
2572         .shutdown               = tcp_shutdown,
2573         .setsockopt             = tcp_setsockopt,
2574         .getsockopt             = tcp_getsockopt,
2575         .recvmsg                = tcp_recvmsg,
2576         .sendmsg                = tcp_sendmsg,
2577         .sendpage               = tcp_sendpage,
2578         .backlog_rcv            = tcp_v4_do_rcv,
2579         .hash                   = inet_hash,
2580         .unhash                 = inet_unhash,
2581         .get_port               = inet_csk_get_port,
2582         .enter_memory_pressure  = tcp_enter_memory_pressure,
2583         .sockets_allocated      = &tcp_sockets_allocated,
2584         .orphan_count           = &tcp_orphan_count,
2585         .memory_allocated       = &tcp_memory_allocated,
2586         .memory_pressure        = &tcp_memory_pressure,
2587         .sysctl_wmem            = sysctl_tcp_wmem,
2588         .sysctl_rmem            = sysctl_tcp_rmem,
2589         .max_header             = MAX_TCP_HEADER,
2590         .obj_size               = sizeof(struct tcp_sock),
2591         .slab_flags             = SLAB_DESTROY_BY_RCU,
2592         .twsk_prot              = &tcp_timewait_sock_ops,
2593         .rsk_prot               = &tcp_request_sock_ops,
2594         .h.hashinfo             = &tcp_hashinfo,
2595         .no_autobind            = true,
2596 #ifdef CONFIG_COMPAT
2597         .compat_setsockopt      = compat_tcp_setsockopt,
2598         .compat_getsockopt      = compat_tcp_getsockopt,
2599 #endif
2600 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2601         .init_cgroup            = tcp_init_cgroup,
2602         .destroy_cgroup         = tcp_destroy_cgroup,
2603         .proto_cgroup           = tcp_proto_cgroup,
2604 #endif
2605 };
2606 EXPORT_SYMBOL(tcp_prot);
2607
2608 static int __net_init tcp_sk_init(struct net *net)
2609 {
2610         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2611                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2612 }
2613
2614 static void __net_exit tcp_sk_exit(struct net *net)
2615 {
2616         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2617 }
2618
2619 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2620 {
2621         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2622 }
2623
2624 static struct pernet_operations __net_initdata tcp_sk_ops = {
2625        .init       = tcp_sk_init,
2626        .exit       = tcp_sk_exit,
2627        .exit_batch = tcp_sk_exit_batch,
2628 };
2629
2630 void __init tcp_v4_init(void)
2631 {
2632         inet_hashinfo_init(&tcp_hashinfo);
2633         if (register_pernet_subsys(&tcp_sk_ops))
2634                 panic("Failed to create the TCP control socket.\n");
2635 }