1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83
84 #include <linux/crypto.h>
85 #include <linux/scatterlist.h>
86
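/* Tunables exposed as /proc/sys/net/ipv4/tcp_tw_reuse and
 * /proc/sys/net/ipv4/tcp_low_latency (registered in sysctl_net_ipv4.c).
 */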
87 int sysctl_tcp_tw_reuse __read_mostly;
88 int sysctl_tcp_low_latency __read_mostly;
89 EXPORT_SYMBOL(sysctl_tcp_low_latency);
90
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
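/* Derive the initial sequence number for a connection from the incoming
 * segment's address/port 4-tuple (the skb is the peer's packet, so its
 * destination fields are our local address and port).
 */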
100 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
116
117            Actually, the idea is close to VJ's one, only timestamp cache is
118            held not per host, but per port pair and TW bucket is used as state
119            holder.
120
121            If TW bucket has been already destroyed we fall back to VJ's scheme
122            and use initial timestamp retrieved from peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (twp == NULL || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk, true);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         inet->inet_rcv_saddr = inet->inet_saddr;
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 tp->write_seq              = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
203                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
204                 /*
205                  * VJ's idea. We save last timestamp seen from
206                  * the destination in peer table, when entering state
207  *              TIME-WAIT, and initialize rx_opt.ts_recent from it,
208                  * when trying new connection.
209                  */
210                 if (peer) {
211                         inet_peer_refcheck(peer);
212                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
213                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
214                                 tp->rx_opt.ts_recent = peer->tcp_ts;
215                         }
216                 }
217         }
218
219         inet->inet_dport = usin->sin_port;
220         inet->inet_daddr = daddr;
221
222         inet_csk(sk)->icsk_ext_hdr_len = 0;
223         if (inet_opt)
224                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
225
226         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
227
228         /* Socket identity is still unknown (sport may be zero).
229          * However we set state to SYN-SENT and, without releasing the socket
230          * lock, select a source port, enter ourselves into the hash tables and
231          * complete initialization after this.
232          */
233         tcp_set_state(sk, TCP_SYN_SENT);
234         err = inet_hash_connect(&tcp_death_row, sk);
235         if (err)
236                 goto failure;
237
238         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
239                                inet->inet_sport, inet->inet_dport, sk);
240         if (IS_ERR(rt)) {
241                 err = PTR_ERR(rt);
242                 rt = NULL;
243                 goto failure;
244         }
245         /* OK, now commit destination to socket.  */
246         sk->sk_gso_type = SKB_GSO_TCPV4;
247         sk_setup_caps(sk, &rt->dst);
248
249         if (!tp->write_seq)
250                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251                                                            inet->inet_daddr,
252                                                            inet->inet_sport,
253                                                            usin->sin_port);
254
255         inet->inet_id = tp->write_seq ^ jiffies;
256
257         err = tcp_connect(sk);
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276
277 /*
278  * This routine does path mtu discovery as defined in RFC1191.
279  */
280 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
281 {
282         struct dst_entry *dst;
283         struct inet_sock *inet = inet_sk(sk);
284
285         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286          * sent out by Linux are always < 576 bytes, so they should go through
287          * unfragmented).
288          */
289         if (sk->sk_state == TCP_LISTEN)
290                 return;
291
292         /* We don't check in the dst entry if pmtu discovery is forbidden
293          * on this route. We just assume that no packet-too-big messages
294          * are sent back when pmtu discovery is not active.
295          * There is a small race when the user changes this flag in the
296          * route, but I think that's acceptable.
297          */
298         if ((dst = __sk_dst_check(sk, 0)) == NULL)
299                 return;
300
301         dst->ops->update_pmtu(dst, mtu);
302
303         /* Something is about to be wrong... Remember soft error
304          * for the case that this connection will not be able to recover.
305          */
306         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307                 sk->sk_err_soft = EMSGSIZE;
308
309         mtu = dst_mtu(dst);
310
311         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
312             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
313                 tcp_sync_mss(sk, mtu);
314
315                 /* Resend the TCP packet because it's
316                  * clear that the old packet has been
317                  * dropped. This is the new "fast" path mtu
318                  * discovery.
319                  */
320                 tcp_simple_retransmit(sk);
321         } /* else let the usual retransmit timer handle it */
322 }
323
324 /*
325  * This routine is called by the ICMP module when it gets some
326  * sort of error condition.  If err < 0 then the socket should
327  * be closed and the error returned to the user.  If err > 0
328  * it's just the icmp type << 8 | icmp code.  After adjustment
329  * header points to the first 8 bytes of the tcp header.  We need
330  * to find the appropriate port.
331  *
332  * The locking strategy used here is very "optimistic". When
333  * someone else accesses the socket the ICMP is just dropped
334  * and for some paths there is no check at all.
335  * A more general error queue to queue errors for later handling
336  * is probably better.
337  *
338  */
339
340 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
341 {
342         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
343         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
344         struct inet_connection_sock *icsk;
345         struct tcp_sock *tp;
346         struct inet_sock *inet;
347         const int type = icmp_hdr(icmp_skb)->type;
348         const int code = icmp_hdr(icmp_skb)->code;
349         struct sock *sk;
350         struct sk_buff *skb;
351         __u32 seq;
352         __u32 remaining;
353         int err;
354         struct net *net = dev_net(icmp_skb->dev);
355
356         if (icmp_skb->len < (iph->ihl << 2) + 8) {
357                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
358                 return;
359         }
360
361         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
362                         iph->saddr, th->source, inet_iif(icmp_skb));
363         if (!sk) {
364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365                 return;
366         }
367         if (sk->sk_state == TCP_TIME_WAIT) {
368                 inet_twsk_put(inet_twsk(sk));
369                 return;
370         }
371
372         bh_lock_sock(sk);
373         /* If too many ICMPs get dropped on busy
374          * servers this needs to be solved differently.
375          */
376         if (sock_owned_by_user(sk))
377                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
378
379         if (sk->sk_state == TCP_CLOSE)
380                 goto out;
381
382         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
383                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
384                 goto out;
385         }
386
387         icsk = inet_csk(sk);
388         tp = tcp_sk(sk);
389         seq = ntohl(th->seq);
390         if (sk->sk_state != TCP_LISTEN &&
391             !between(seq, tp->snd_una, tp->snd_nxt)) {
392                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393                 goto out;
394         }
395
396         switch (type) {
397         case ICMP_SOURCE_QUENCH:
398                 /* Just silently ignore these. */
399                 goto out;
400         case ICMP_PARAMETERPROB:
401                 err = EPROTO;
402                 break;
403         case ICMP_DEST_UNREACH:
404                 if (code > NR_ICMP_UNREACH)
405                         goto out;
406
407                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
408                         if (!sock_owned_by_user(sk))
409                                 do_pmtu_discovery(sk, iph, info);
410                         goto out;
411                 }
412
413                 err = icmp_err_convert[code].errno;
414                 /* check if icmp_skb allows revert of backoff
415                  * (see draft-zimmermann-tcp-lcd) */
416                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
417                         break;
418                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
419                     !icsk->icsk_backoff)
420                         break;
421
422                 if (sock_owned_by_user(sk))
423                         break;
424
425                 icsk->icsk_backoff--;
426                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
427                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
428                 tcp_bound_rto(sk);
429
430                 skb = tcp_write_queue_head(sk);
431                 BUG_ON(!skb);
432
433                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
434                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
435
436                 if (remaining) {
437                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438                                                   remaining, TCP_RTO_MAX);
439                 } else {
440                         /* RTO revert clocked out retransmission.
441                          * Will retransmit now */
442                         tcp_retransmit_timer(sk);
443                 }
444
445                 break;
446         case ICMP_TIME_EXCEEDED:
447                 err = EHOSTUNREACH;
448                 break;
449         default:
450                 goto out;
451         }
452
453         switch (sk->sk_state) {
454                 struct request_sock *req, **prev;
455         case TCP_LISTEN:
456                 if (sock_owned_by_user(sk))
457                         goto out;
458
459                 req = inet_csk_search_req(sk, &prev, th->dest,
460                                           iph->daddr, iph->saddr);
461                 if (!req)
462                         goto out;
463
464                 /* ICMPs are not backlogged, hence we cannot get
465                    an established socket here.
466                  */
467                 WARN_ON(req->sk);
468
469                 if (seq != tcp_rsk(req)->snt_isn) {
470                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
471                         goto out;
472                 }
473
474                 /*
475                  * Still in SYN_RECV, just remove it silently.
476                  * There is no good way to pass the error to the newly
477                  * created socket, and POSIX does not want network
478                  * errors returned from accept().
479                  */
480                 inet_csk_reqsk_queue_drop(sk, req, prev);
481                 goto out;
482
483         case TCP_SYN_SENT:
484         case TCP_SYN_RECV:  /* Cannot happen.
485                                Well, it can, e.g. if SYNs crossed.
486                              */
487                 if (!sock_owned_by_user(sk)) {
488                         sk->sk_err = err;
489
490                         sk->sk_error_report(sk);
491
492                         tcp_done(sk);
493                 } else {
494                         sk->sk_err_soft = err;
495                 }
496                 goto out;
497         }
498
499         /* If we've already connected we will keep trying
500          * until we time out, or the user gives up.
501          *
502          * rfc1122 4.2.3.9 allows us to treat as hard errors
503          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
504          * but it is obsoleted by pmtu discovery).
505          *
506          * Note that in the modern internet, where routing is unreliable
507          * and broken firewalls sit in every dark corner, sending random
508          * errors ordered by their masters, even these two messages finally lose
509          * their original sense (even Linux sends invalid PORT_UNREACHs).
510          *
511          * Now we are in compliance with RFCs.
512          *                                                      --ANK (980905)
513          */
514
515         inet = inet_sk(sk);
516         if (!sock_owned_by_user(sk) && inet->recverr) {
517                 sk->sk_err = err;
518                 sk->sk_error_report(sk);
519         } else  { /* Only an error on timeout */
520                 sk->sk_err_soft = err;
521         }
522
523 out:
524         bh_unlock_sock(sk);
525         sock_put(sk);
526 }
527
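/* Fill in the TCP checksum for an outgoing segment: either set up a
 * partial checksum for the hardware to complete (CHECKSUM_PARTIAL) or
 * compute the full checksum over the header and data in software.
 */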
528 static void __tcp_v4_send_check(struct sk_buff *skb,
529                                 __be32 saddr, __be32 daddr)
530 {
531         struct tcphdr *th = tcp_hdr(skb);
532
533         if (skb->ip_summed == CHECKSUM_PARTIAL) {
534                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
535                 skb->csum_start = skb_transport_header(skb) - skb->head;
536                 skb->csum_offset = offsetof(struct tcphdr, check);
537         } else {
538                 th->check = tcp_v4_check(skb->len, saddr, daddr,
539                                          csum_partial(th,
540                                                       th->doff << 2,
541                                                       skb->csum));
542         }
543 }
544
545 /* This routine computes an IPv4 TCP checksum. */
546 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
547 {
548         const struct inet_sock *inet = inet_sk(sk);
549
550         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
551 }
552 EXPORT_SYMBOL(tcp_v4_send_check);
553
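/* Prime a GSO super-packet for checksum offload: mark it
 * CHECKSUM_PARTIAL and seed the TCP checksum field with the
 * pseudo-header sum so each resulting segment can be completed later.
 */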
554 int tcp_v4_gso_send_check(struct sk_buff *skb)
555 {
556         const struct iphdr *iph;
557         struct tcphdr *th;
558
559         if (!pskb_may_pull(skb, sizeof(*th)))
560                 return -EINVAL;
561
562         iph = ip_hdr(skb);
563         th = tcp_hdr(skb);
564
565         th->check = 0;
566         skb->ip_summed = CHECKSUM_PARTIAL;
567         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
568         return 0;
569 }
570
571 /*
572  *      This routine will send an RST to the other tcp.
573  *
574  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL, etc.)
575  *                    for the reset?
576  *      Answer: if a packet caused the RST, it is not for a socket
577  *              existing in our system; if it did match a socket,
578  *              it is just a duplicate segment or a bug in the other side's TCP.
579  *              So we build the reply based only on the parameters that
580  *              arrived with the segment.
581  *      Exception: precedence violation. We do not implement it in any case.
582  */
583
584 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
585 {
586         const struct tcphdr *th = tcp_hdr(skb);
587         struct {
588                 struct tcphdr th;
589 #ifdef CONFIG_TCP_MD5SIG
590                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 #endif
592         } rep;
593         struct ip_reply_arg arg;
594 #ifdef CONFIG_TCP_MD5SIG
595         struct tcp_md5sig_key *key;
596 #endif
597         struct net *net;
598
599         /* Never send a reset in response to a reset. */
600         if (th->rst)
601                 return;
602
603         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
604                 return;
605
606         /* Swap the send and the receive. */
607         memset(&rep, 0, sizeof(rep));
608         rep.th.dest   = th->source;
609         rep.th.source = th->dest;
610         rep.th.doff   = sizeof(struct tcphdr) / 4;
611         rep.th.rst    = 1;
612
613         if (th->ack) {
614                 rep.th.seq = th->ack_seq;
615         } else {
616                 rep.th.ack = 1;
617                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
618                                        skb->len - (th->doff << 2));
619         }
620
621         memset(&arg, 0, sizeof(arg));
622         arg.iov[0].iov_base = (unsigned char *)&rep;
623         arg.iov[0].iov_len  = sizeof(rep.th);
624
625 #ifdef CONFIG_TCP_MD5SIG
626         key = sk ? tcp_md5_do_lookup(sk,
627                                      (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
628                                      AF_INET) : NULL;
629         if (key) {
630                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
631                                    (TCPOPT_NOP << 16) |
632                                    (TCPOPT_MD5SIG << 8) |
633                                    TCPOLEN_MD5SIG);
634                 /* Update length and the length the header thinks exists */
635                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
636                 rep.th.doff = arg.iov[0].iov_len / 4;
637
638                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
639                                      key, ip_hdr(skb)->saddr,
640                                      ip_hdr(skb)->daddr, &rep.th);
641         }
642 #endif
643         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
644                                       ip_hdr(skb)->saddr, /* XXX */
645                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
646         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
647         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
648
649         net = dev_net(skb_dst(skb)->dev);
650         arg.tos = ip_hdr(skb)->tos;
651         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
652                       &arg, arg.iov[0].iov_len);
653
654         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
655         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
656 }
657
658 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
659    outside of socket context, is certainly ugly. What can I do?
660  */
661
662 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
663                             u32 win, u32 ts, int oif,
664                             struct tcp_md5sig_key *key,
665                             int reply_flags, u8 tos)
666 {
667         const struct tcphdr *th = tcp_hdr(skb);
668         struct {
669                 struct tcphdr th;
670                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
671 #ifdef CONFIG_TCP_MD5SIG
672                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
673 #endif
674                         ];
675         } rep;
676         struct ip_reply_arg arg;
677         struct net *net = dev_net(skb_dst(skb)->dev);
678
679         memset(&rep.th, 0, sizeof(struct tcphdr));
680         memset(&arg, 0, sizeof(arg));
681
682         arg.iov[0].iov_base = (unsigned char *)&rep;
683         arg.iov[0].iov_len  = sizeof(rep.th);
684         if (ts) {
685                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
686                                    (TCPOPT_TIMESTAMP << 8) |
687                                    TCPOLEN_TIMESTAMP);
688                 rep.opt[1] = htonl(tcp_time_stamp);
689                 rep.opt[2] = htonl(ts);
690                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
691         }
692
693         /* Swap the send and the receive. */
694         rep.th.dest    = th->source;
695         rep.th.source  = th->dest;
696         rep.th.doff    = arg.iov[0].iov_len / 4;
697         rep.th.seq     = htonl(seq);
698         rep.th.ack_seq = htonl(ack);
699         rep.th.ack     = 1;
700         rep.th.window  = htons(win);
701
702 #ifdef CONFIG_TCP_MD5SIG
703         if (key) {
704                 int offset = (ts) ? 3 : 0;
705
706                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
707                                           (TCPOPT_NOP << 16) |
708                                           (TCPOPT_MD5SIG << 8) |
709                                           TCPOLEN_MD5SIG);
710                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
711                 rep.th.doff = arg.iov[0].iov_len/4;
712
713                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
714                                     key, ip_hdr(skb)->saddr,
715                                     ip_hdr(skb)->daddr, &rep.th);
716         }
717 #endif
718         arg.flags = reply_flags;
719         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
720                                       ip_hdr(skb)->saddr, /* XXX */
721                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
722         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
723         if (oif)
724                 arg.bound_dev_if = oif;
725         arg.tos = tos;
726         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
727                       &arg, arg.iov[0].iov_len);
728
729         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
730 }
731
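/* Send an ACK on behalf of a TIME-WAIT socket, echoing its recorded
 * sequence numbers, window and timestamp.
 */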
732 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
733 {
734         struct inet_timewait_sock *tw = inet_twsk(sk);
735         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
736
737         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
738                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
739                         tcptw->tw_ts_recent,
740                         tw->tw_bound_dev_if,
741                         tcp_twsk_md5_key(tcptw),
742                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
743                         tw->tw_tos
744                         );
745
746         inet_twsk_put(tw);
747 }
748
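/* ACK a segment on behalf of an embryonic connection (SYN-RECV state),
 * using the sequence numbers recorded in the request_sock.
 */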
749 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
750                                   struct request_sock *req)
751 {
752         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
753                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
754                         req->ts_recent,
755                         0,
756                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
757                                           AF_INET),
758                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
759                         ip_hdr(skb)->tos);
760 }
761
762 /*
763  *      Send a SYN-ACK after having received a SYN.
764  *      This still operates on a request_sock only, not on a big
765  *      socket.
766  */
767 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768                               struct request_sock *req,
769                               struct request_values *rvp)
770 {
771         const struct inet_request_sock *ireq = inet_rsk(req);
772         struct flowi4 fl4;
773         int err = -1;
774         struct sk_buff * skb;
775
776         /* First, grab a route. */
777         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
778                 return -1;
779
780         skb = tcp_make_synack(sk, dst, req, rvp);
781
782         if (skb) {
783                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
784
785                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
786                                             ireq->rmt_addr,
787                                             ireq->opt);
788                 err = net_xmit_eval(err);
789         }
790
791         dst_release(dst);
792         return err;
793 }
794
795 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
796                               struct request_values *rvp)
797 {
798         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
799         return tcp_v4_send_synack(sk, NULL, req, rvp);
800 }
801
802 /*
803  *      IPv4 request_sock destructor.
804  */
805 static void tcp_v4_reqsk_destructor(struct request_sock *req)
806 {
807         kfree(inet_rsk(req)->opt);
808 }
809
810 /*
811  * Return 1 if a syncookie should be sent
812  */
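/* The two counters bumped below show up in the TcpExt section of
 * /proc/net/netstat as TCPReqQFullDoCookies and TCPReqQFullDrop.
 */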
813 int tcp_syn_flood_action(struct sock *sk,
814                          const struct sk_buff *skb,
815                          const char *proto)
816 {
817         const char *msg = "Dropping request";
818         int want_cookie = 0;
819         struct listen_sock *lopt;
820
823 #ifdef CONFIG_SYN_COOKIES
824         if (sysctl_tcp_syncookies) {
825                 msg = "Sending cookies";
826                 want_cookie = 1;
827                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
828         } else
829 #endif
830                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
831
832         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
833         if (!lopt->synflood_warned) {
834                 lopt->synflood_warned = 1;
835                 pr_info("%s: Possible SYN flooding on port %d. %s. "
836                         " Check SNMP counters.\n",
837                         proto, ntohs(tcp_hdr(skb)->dest), msg);
838         }
839         return want_cookie;
840 }
841 EXPORT_SYMBOL(tcp_syn_flood_action);
842
843 /*
844  * Save and compile IPv4 options into the request_sock if needed.
845  */
846 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
847                                                   struct sk_buff *skb)
848 {
849         const struct ip_options *opt = &(IPCB(skb)->opt);
850         struct ip_options_rcu *dopt = NULL;
851
852         if (opt && opt->optlen) {
853                 int opt_size = sizeof(*dopt) + opt->optlen;
854
855                 dopt = kmalloc(opt_size, GFP_ATOMIC);
856                 if (dopt) {
857                         if (ip_options_echo(&dopt->opt, skb)) {
858                                 kfree(dopt);
859                                 dopt = NULL;
860                         }
861                 }
862         }
863         return dopt;
864 }
865
866 #ifdef CONFIG_TCP_MD5SIG
867 /*
868  * RFC2385 MD5 checksumming requires a mapping of
869  * IP address->MD5 Key.
870  * We need to maintain these in the sk structure.
871  */
872
873 /* Find the Key structure for an address.  */
874 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
875                                          const union tcp_md5_addr *addr,
876                                          int family)
877 {
878         struct tcp_sock *tp = tcp_sk(sk);
879         struct tcp_md5sig_key *key;
880         struct hlist_node *pos;
881         unsigned int size = sizeof(struct in_addr);
882
883         if (!tp->md5sig_info)
884                 return NULL;
885 #if IS_ENABLED(CONFIG_IPV6)
886         if (family == AF_INET6)
887                 size = sizeof(struct in6_addr);
888 #endif
889         hlist_for_each_entry_rcu(key, pos, &tp->md5sig_info->head, node) {
890                 if (key->family != family)
891                         continue;
892                 if (!memcmp(&key->addr, addr, size))
893                         return key;
894         }
895         return NULL;
896 }
897 EXPORT_SYMBOL(tcp_md5_do_lookup);
898
899 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
900                                          struct sock *addr_sk)
901 {
902         union tcp_md5_addr *addr;
903
904         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
905         return tcp_md5_do_lookup(sk, addr, AF_INET);
906 }
907 EXPORT_SYMBOL(tcp_v4_md5_lookup);
908
909 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
910                                                       struct request_sock *req)
911 {
912         union tcp_md5_addr *addr;
913
914         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
915         return tcp_md5_do_lookup(sk, addr, AF_INET);
916 }
917
918 /* This can be called on a newly created socket, from other files */
919 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
920                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
921 {
922         /* Add Key to the list */
923         struct tcp_md5sig_key *key;
924         struct tcp_sock *tp = tcp_sk(sk);
925         struct tcp_md5sig_info *md5sig;
926
927         key = tcp_md5_do_lookup(sk, addr, family);
928         if (key) {
929                 /* Pre-existing entry - just update that one. */
930                 memcpy(key->key, newkey, newkeylen);
931                 key->keylen = newkeylen;
932                 return 0;
933         }
934
935         md5sig = tp->md5sig_info;
936         if (!md5sig) {
937                 md5sig = kmalloc(sizeof(*md5sig), gfp);
938                 if (!md5sig)
939                         return -ENOMEM;
940
941                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
942                 INIT_HLIST_HEAD(&md5sig->head);
943                 tp->md5sig_info = md5sig;
944         }
945
946         key = sock_kmalloc(sk, sizeof(*key), gfp);
947         if (!key)
948                 return -ENOMEM;
949         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
950                 sock_kfree_s(sk, key, sizeof(*key));
951                 return -ENOMEM;
952         }
953
954         memcpy(key->key, newkey, newkeylen);
955         key->keylen = newkeylen;
956         key->family = family;
957         memcpy(&key->addr, addr,
958                (family == AF_INET6) ? sizeof(struct in6_addr) :
959                                       sizeof(struct in_addr));
960         hlist_add_head_rcu(&key->node, &md5sig->head);
961         return 0;
962 }
963 EXPORT_SYMBOL(tcp_md5_do_add);
964
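/* Remove the MD5 key matching addr from the socket's list, releasing the
 * shared MD5 pool when the last key goes away.
 */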
965 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
966 {
967         struct tcp_sock *tp = tcp_sk(sk);
968         struct tcp_md5sig_key *key;
969
970         key = tcp_md5_do_lookup(sk, addr, family);
971         if (!key)
972                 return -ENOENT;
973         hlist_del_rcu(&key->node);
974         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
975         kfree_rcu(key, rcu);
976         if (hlist_empty(&tp->md5sig_info->head))
977                 tcp_free_md5sig_pool();
978         return 0;
979 }
980 EXPORT_SYMBOL(tcp_md5_do_del);
981
982 void tcp_clear_md5_list(struct sock *sk)
983 {
984         struct tcp_sock *tp = tcp_sk(sk);
985         struct tcp_md5sig_key *key;
986         struct hlist_node *pos, *n;
987
988         if (!hlist_empty(&tp->md5sig_info->head))
989                 tcp_free_md5sig_pool();
990         hlist_for_each_entry_safe(key, pos, n, &tp->md5sig_info->head, node) {
991                 hlist_del_rcu(&key->node);
992                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
993                 kfree_rcu(key, rcu);
994         }
995 }
996
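/* This is the handler behind setsockopt(TCP_MD5SIG): do_tcp_setsockopt()
 * dispatches here through af_specific->md5_parse.  A rough userspace
 * sketch (illustrative only; the address and key are placeholders):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = inet_addr("192.0.2.1");	/* peer address */
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */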
997 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
998                                  int optlen)
999 {
1000         struct tcp_md5sig cmd;
1001         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1002
1003         if (optlen < sizeof(cmd))
1004                 return -EINVAL;
1005
1006         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1007                 return -EFAULT;
1008
1009         if (sin->sin_family != AF_INET)
1010                 return -EINVAL;
1011
1012         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1013                 if (!tcp_sk(sk)->md5sig_info)
1014                         return -ENOENT;
1015                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1016                                       AF_INET);
1017         }
1018
1019         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020                 return -EINVAL;
1021
1022         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1023                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1024                               GFP_KERNEL);
1025 }
1026
1027 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1028                                         __be32 daddr, __be32 saddr, int nbytes)
1029 {
1030         struct tcp4_pseudohdr *bp;
1031         struct scatterlist sg;
1032
1033         bp = &hp->md5_blk.ip4;
1034
1035         /*
1036          * 1. the TCP pseudo-header (in the order: source IP address,
1037          * destination IP address, zero-padded protocol number, and
1038          * segment length)
1039          */
1040         bp->saddr = saddr;
1041         bp->daddr = daddr;
1042         bp->pad = 0;
1043         bp->protocol = IPPROTO_TCP;
1044         bp->len = cpu_to_be16(nbytes);
1045
1046         sg_init_one(&sg, bp, sizeof(*bp));
1047         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1048 }
1049
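/* Compute the RFC 2385 MD5 signature over the pseudo-header, the TCP
 * header and the key alone (no payload).  Used for the stateless
 * RST/ACK replies built in this file, where no full skb is at hand.
 */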
1050 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1051                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1052 {
1053         struct tcp_md5sig_pool *hp;
1054         struct hash_desc *desc;
1055
1056         hp = tcp_get_md5sig_pool();
1057         if (!hp)
1058                 goto clear_hash_noput;
1059         desc = &hp->md5_desc;
1060
1061         if (crypto_hash_init(desc))
1062                 goto clear_hash;
1063         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1064                 goto clear_hash;
1065         if (tcp_md5_hash_header(hp, th))
1066                 goto clear_hash;
1067         if (tcp_md5_hash_key(hp, key))
1068                 goto clear_hash;
1069         if (crypto_hash_final(desc, md5_hash))
1070                 goto clear_hash;
1071
1072         tcp_put_md5sig_pool();
1073         return 0;
1074
1075 clear_hash:
1076         tcp_put_md5sig_pool();
1077 clear_hash_noput:
1078         memset(md5_hash, 0, 16);
1079         return 1;
1080 }
1081
1082 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1083                         const struct sock *sk, const struct request_sock *req,
1084                         const struct sk_buff *skb)
1085 {
1086         struct tcp_md5sig_pool *hp;
1087         struct hash_desc *desc;
1088         const struct tcphdr *th = tcp_hdr(skb);
1089         __be32 saddr, daddr;
1090
1091         if (sk) {
1092                 saddr = inet_sk(sk)->inet_saddr;
1093                 daddr = inet_sk(sk)->inet_daddr;
1094         } else if (req) {
1095                 saddr = inet_rsk(req)->loc_addr;
1096                 daddr = inet_rsk(req)->rmt_addr;
1097         } else {
1098                 const struct iphdr *iph = ip_hdr(skb);
1099                 saddr = iph->saddr;
1100                 daddr = iph->daddr;
1101         }
1102
1103         hp = tcp_get_md5sig_pool();
1104         if (!hp)
1105                 goto clear_hash_noput;
1106         desc = &hp->md5_desc;
1107
1108         if (crypto_hash_init(desc))
1109                 goto clear_hash;
1110
1111         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1112                 goto clear_hash;
1113         if (tcp_md5_hash_header(hp, th))
1114                 goto clear_hash;
1115         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1116                 goto clear_hash;
1117         if (tcp_md5_hash_key(hp, key))
1118                 goto clear_hash;
1119         if (crypto_hash_final(desc, md5_hash))
1120                 goto clear_hash;
1121
1122         tcp_put_md5sig_pool();
1123         return 0;
1124
1125 clear_hash:
1126         tcp_put_md5sig_pool();
1127 clear_hash_noput:
1128         memset(md5_hash, 0, 16);
1129         return 1;
1130 }
1131 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1132
1133 static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1134 {
1135         /*
1136          * This gets called for each TCP segment that arrives
1137          * so we want to be efficient.
1138          * We have 3 drop cases:
1139          * o No MD5 hash and one expected.
1140          * o MD5 hash and we're not expecting one.
1141          * o MD5 hash and it's wrong.
1142          */
1143         const __u8 *hash_location = NULL;
1144         struct tcp_md5sig_key *hash_expected;
1145         const struct iphdr *iph = ip_hdr(skb);
1146         const struct tcphdr *th = tcp_hdr(skb);
1147         int genhash;
1148         unsigned char newhash[16];
1149
1150         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1151                                           AF_INET);
1152         hash_location = tcp_parse_md5sig_option(th);
1153
1154         /* We've parsed the options - do we have a hash? */
1155         if (!hash_expected && !hash_location)
1156                 return 0;
1157
1158         if (hash_expected && !hash_location) {
1159                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1160                 return 1;
1161         }
1162
1163         if (!hash_expected && hash_location) {
1164                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1165                 return 1;
1166         }
1167
1168         /* Okay, so this is hash_expected and hash_location -
1169          * so we need to calculate the checksum.
1170          */
1171         genhash = tcp_v4_md5_hash_skb(newhash,
1172                                       hash_expected,
1173                                       NULL, NULL, skb);
1174
1175         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1176                 if (net_ratelimit()) {
1177                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1178                                &iph->saddr, ntohs(th->source),
1179                                &iph->daddr, ntohs(th->dest),
1180                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1181                 }
1182                 return 1;
1183         }
1184         return 0;
1185 }
1186
1187 #endif
1188
1189 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1190         .family         =       PF_INET,
1191         .obj_size       =       sizeof(struct tcp_request_sock),
1192         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1193         .send_ack       =       tcp_v4_reqsk_send_ack,
1194         .destructor     =       tcp_v4_reqsk_destructor,
1195         .send_reset     =       tcp_v4_send_reset,
1196         .syn_ack_timeout =      tcp_syn_ack_timeout,
1197 };
1198
1199 #ifdef CONFIG_TCP_MD5SIG
1200 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1201         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1202         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1203 };
1204 #endif
1205
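/* Process an incoming SYN on a listening socket: allocate and fill a
 * request_sock, decide whether to answer with a syncookie, pick the
 * initial sequence number and send the SYN-ACK.
 */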
1206 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1207 {
1208         struct tcp_extend_values tmp_ext;
1209         struct tcp_options_received tmp_opt;
1210         const u8 *hash_location;
1211         struct request_sock *req;
1212         struct inet_request_sock *ireq;
1213         struct tcp_sock *tp = tcp_sk(sk);
1214         struct dst_entry *dst = NULL;
1215         __be32 saddr = ip_hdr(skb)->saddr;
1216         __be32 daddr = ip_hdr(skb)->daddr;
1217         __u32 isn = TCP_SKB_CB(skb)->when;
1218         int want_cookie = 0;
1219
1220         /* Never answer SYNs sent to broadcast or multicast addresses */
1221         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1222                 goto drop;
1223
1224         /* TW buckets are converted to open requests without
1225          * limitation, since they conserve resources and the peer is
1226          * evidently a real one.
1227          */
1228         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1229                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1230                 if (!want_cookie)
1231                         goto drop;
1232         }
1233
1234         /* Accept backlog is full. If we have already queued enough
1235          * warm entries in the syn queue, drop the request. That is better than
1236          * clogging the syn queue with openreqs with exponentially increasing
1237          * timeout.
1238          */
1239         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1240                 goto drop;
1241
1242         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1243         if (!req)
1244                 goto drop;
1245
1246 #ifdef CONFIG_TCP_MD5SIG
1247         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1248 #endif
1249
1250         tcp_clear_options(&tmp_opt);
1251         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1252         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1253         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1254
1255         if (tmp_opt.cookie_plus > 0 &&
1256             tmp_opt.saw_tstamp &&
1257             !tp->rx_opt.cookie_out_never &&
1258             (sysctl_tcp_cookie_size > 0 ||
1259              (tp->cookie_values != NULL &&
1260               tp->cookie_values->cookie_desired > 0))) {
1261                 u8 *c;
1262                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1263                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1264
1265                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1266                         goto drop_and_release;
1267
1268                 /* Secret recipe starts with IP addresses */
1269                 *mess++ ^= (__force u32)daddr;
1270                 *mess++ ^= (__force u32)saddr;
1271
1272                 /* plus variable length Initiator Cookie */
1273                 c = (u8 *)mess;
1274                 while (l-- > 0)
1275                         *c++ ^= *hash_location++;
1276
1277                 want_cookie = 0;        /* not our kind of cookie */
1278                 tmp_ext.cookie_out_never = 0; /* false */
1279                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1280         } else if (!tp->rx_opt.cookie_in_always) {
1281                 /* redundant indications, but ensure initialization. */
1282                 tmp_ext.cookie_out_never = 1; /* true */
1283                 tmp_ext.cookie_plus = 0;
1284         } else {
1285                 goto drop_and_release;
1286         }
1287         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1288
1289         if (want_cookie && !tmp_opt.saw_tstamp)
1290                 tcp_clear_options(&tmp_opt);
1291
1292         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1293         tcp_openreq_init(req, &tmp_opt, skb);
1294
1295         ireq = inet_rsk(req);
1296         ireq->loc_addr = daddr;
1297         ireq->rmt_addr = saddr;
1298         ireq->no_srccheck = inet_sk(sk)->transparent;
1299         ireq->opt = tcp_v4_save_options(sk, skb);
1300
1301         if (security_inet_conn_request(sk, skb, req))
1302                 goto drop_and_free;
1303
1304         if (!want_cookie || tmp_opt.tstamp_ok)
1305                 TCP_ECN_create_request(req, tcp_hdr(skb));
1306
1307         if (want_cookie) {
1308                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1309                 req->cookie_ts = tmp_opt.tstamp_ok;
1310         } else if (!isn) {
1311                 struct inet_peer *peer = NULL;
1312                 struct flowi4 fl4;
1313
1314                 /* VJ's idea. We save the last timestamp seen
1315                  * from the destination in the peer table when entering
1316                  * state TIME-WAIT, and check against it before
1317                  * accepting a new connection request.
1318                  *
1319                  * If "isn" is not zero, this request hit an alive
1320                  * timewait bucket, so all the necessary checks
1321                  * were already made in the function processing timewait state.
1322                  */
1323                 if (tmp_opt.saw_tstamp &&
1324                     tcp_death_row.sysctl_tw_recycle &&
1325                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1326                     fl4.daddr == saddr &&
1327                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1328                         inet_peer_refcheck(peer);
1329                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1330                             (s32)(peer->tcp_ts - req->ts_recent) >
1331                                                         TCP_PAWS_WINDOW) {
1332                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1333                                 goto drop_and_release;
1334                         }
1335                 }
1336                 /* Kill the following clause, if you dislike this way. */
1337                 else if (!sysctl_tcp_syncookies &&
1338                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1339                           (sysctl_max_syn_backlog >> 2)) &&
1340                          (!peer || !peer->tcp_ts_stamp) &&
1341                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1342                         /* Without syncookies, the last quarter of the
1343                          * backlog is filled only with destinations
1344                          * proven to be alive.
1345                          * It means that we continue to communicate
1346                          * with destinations already remembered
1347                          * at the moment of the SYN flood.
1348                          */
1349                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1350                                        &saddr, ntohs(tcp_hdr(skb)->source));
1351                         goto drop_and_release;
1352                 }
1353
1354                 isn = tcp_v4_init_sequence(skb);
1355         }
1356         tcp_rsk(req)->snt_isn = isn;
1357         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1358
1359         if (tcp_v4_send_synack(sk, dst, req,
1360                                (struct request_values *)&tmp_ext) ||
1361             want_cookie)
1362                 goto drop_and_free;
1363
1364         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1365         return 0;
1366
1367 drop_and_release:
1368         dst_release(dst);
1369 drop_and_free:
1370         reqsk_free(req);
1371 drop:
1372         return 0;
1373 }
1374 EXPORT_SYMBOL(tcp_v4_conn_request);
1375
1376
1377 /*
1378  * The three way handshake has completed - we got a valid synack -
1379  * now create the new socket.
1380  */
1381 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1382                                   struct request_sock *req,
1383                                   struct dst_entry *dst)
1384 {
1385         struct inet_request_sock *ireq;
1386         struct inet_sock *newinet;
1387         struct tcp_sock *newtp;
1388         struct sock *newsk;
1389 #ifdef CONFIG_TCP_MD5SIG
1390         struct tcp_md5sig_key *key;
1391 #endif
1392         struct ip_options_rcu *inet_opt;
1393
1394         if (sk_acceptq_is_full(sk))
1395                 goto exit_overflow;
1396
1397         newsk = tcp_create_openreq_child(sk, req, skb);
1398         if (!newsk)
1399                 goto exit_nonewsk;
1400
1401         newsk->sk_gso_type = SKB_GSO_TCPV4;
1402
1403         newtp                 = tcp_sk(newsk);
1404         newinet               = inet_sk(newsk);
1405         ireq                  = inet_rsk(req);
1406         newinet->inet_daddr   = ireq->rmt_addr;
1407         newinet->inet_rcv_saddr = ireq->loc_addr;
1408         newinet->inet_saddr           = ireq->loc_addr;
1409         inet_opt              = ireq->opt;
1410         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1411         ireq->opt             = NULL;
1412         newinet->mc_index     = inet_iif(skb);
1413         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1414         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1415         if (inet_opt)
1416                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1417         newinet->inet_id = newtp->write_seq ^ jiffies;
1418
1419         if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1420                 goto put_and_exit;
1421
1422         sk_setup_caps(newsk, dst);
1423
1424         tcp_mtup_init(newsk);
1425         tcp_sync_mss(newsk, dst_mtu(dst));
1426         newtp->advmss = dst_metric_advmss(dst);
1427         if (tcp_sk(sk)->rx_opt.user_mss &&
1428             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1429                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1430
1431         tcp_initialize_rcv_mss(newsk);
1432         if (tcp_rsk(req)->snt_synack)
1433                 tcp_valid_rtt_meas(newsk,
1434                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1435         newtp->total_retrans = req->retrans;
1436
1437 #ifdef CONFIG_TCP_MD5SIG
1438         /* Copy over the MD5 key from the original socket */
1439         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1440                                 AF_INET);
1441         if (key != NULL) {
1442                 /*
1443                  * We're using one, so create a matching key
1444                  * on the newsk structure. If we fail to get
1445                  * memory, then we end up not copying the key
1446                  * across. Shucks.
1447                  */
1448                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1449                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1450                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1451         }
1452 #endif
1453
1454         if (__inet_inherit_port(sk, newsk) < 0)
1455                 goto put_and_exit;
1456         __inet_hash_nolisten(newsk, NULL);
1457
1458         return newsk;
1459
1460 exit_overflow:
1461         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1462 exit_nonewsk:
1463         dst_release(dst);
1464 exit:
1465         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1466         return NULL;
1467 put_and_exit:
1468         tcp_clear_xmit_timers(newsk);
1469         tcp_cleanup_congestion_control(newsk);
1470         bh_unlock_sock(newsk);
1471         sock_put(newsk);
1472         goto exit;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1475
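/* Handle a segment that arrived on a listening socket.  First look for a
 * matching connection request (a SYN we have already answered) and let
 * tcp_check_req() decide whether this segment completes the handshake.
 * Failing that, check the established hash in case a full (or TIME_WAIT)
 * socket already exists for this 4-tuple.  As a last resort, when
 * syncookies are compiled in and the segment carries no SYN, try to
 * validate it as a syncookie ACK.
 */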
1476 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1477 {
1478         struct tcphdr *th = tcp_hdr(skb);
1479         const struct iphdr *iph = ip_hdr(skb);
1480         struct sock *nsk;
1481         struct request_sock **prev;
1482         /* Find possible connection requests. */
1483         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1484                                                        iph->saddr, iph->daddr);
1485         if (req)
1486                 return tcp_check_req(sk, skb, req, prev);
1487
1488         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1489                         th->source, iph->daddr, th->dest, inet_iif(skb));
1490
1491         if (nsk) {
1492                 if (nsk->sk_state != TCP_TIME_WAIT) {
1493                         bh_lock_sock(nsk);
1494                         return nsk;
1495                 }
1496                 inet_twsk_put(inet_twsk(nsk));
1497                 return NULL;
1498         }
1499
1500 #ifdef CONFIG_SYN_COOKIES
1501         if (!th->syn)
1502                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1503 #endif
1504         return sk;
1505 }
1506
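/* Initialise checksum state for an incoming segment.  With
 * CHECKSUM_COMPLETE from the device the TCP checksum can be verified
 * immediately; otherwise skb->csum is seeded with the pseudo-header sum
 * (saddr, daddr, length, IPPROTO_TCP) and full software verification is
 * forced only for short packets, leaving longer ones to be checksummed
 * later (e.g. while copying to user space).
 */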
1507 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1508 {
1509         const struct iphdr *iph = ip_hdr(skb);
1510
1511         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1512                 if (!tcp_v4_check(skb->len, iph->saddr,
1513                                   iph->daddr, skb->csum)) {
1514                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1515                         return 0;
1516                 }
1517         }
1518
1519         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1520                                        skb->len, IPPROTO_TCP, 0);
1521
1522         if (skb->len <= 76) {
1523                 return __skb_checksum_complete(skb);
1524         }
1525         return 0;
1526 }
1527
1528
1529 /* The socket must have its spinlock held when we get
1530  * here.
1531  *
1532  * We have a potential double-lock case here, so even when
1533  * doing backlog processing we use the BH locking scheme.
1534  * This is because we cannot sleep with the original spinlock
1535  * held.
1536  */
1537 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1538 {
1539         struct sock *rsk;
1540 #ifdef CONFIG_TCP_MD5SIG
1541         /*
1542          * We really want to reject the packet as early as possible
1543          * if:
1544          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1545          *  o There is an MD5 option and we're not expecting one
1546          */
1547         if (tcp_v4_inbound_md5_hash(sk, skb))
1548                 goto discard;
1549 #endif
1550
1551         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1552                 sock_rps_save_rxhash(sk, skb);
1553                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1554                         rsk = sk;
1555                         goto reset;
1556                 }
1557                 return 0;
1558         }
1559
1560         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1561                 goto csum_err;
1562
1563         if (sk->sk_state == TCP_LISTEN) {
1564                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1565                 if (!nsk)
1566                         goto discard;
1567
1568                 if (nsk != sk) {
1569                         sock_rps_save_rxhash(nsk, skb);
1570                         if (tcp_child_process(sk, nsk, skb)) {
1571                                 rsk = nsk;
1572                                 goto reset;
1573                         }
1574                         return 0;
1575                 }
1576         } else
1577                 sock_rps_save_rxhash(sk, skb);
1578
1579         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1580                 rsk = sk;
1581                 goto reset;
1582         }
1583         return 0;
1584
1585 reset:
1586         tcp_v4_send_reset(rsk, skb);
1587 discard:
1588         kfree_skb(skb);
1589         /* Be careful here. If this function gets more complicated and
1590          * gcc suffers from register pressure on the x86, sk (in %ebx)
1591          * might be destroyed here. This current version compiles correctly,
1592          * but you have been warned.
1593          */
1594         return 0;
1595
1596 csum_err:
1597         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1598         goto discard;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_do_rcv);
1601
1602 /*
1603  *      From tcp_input.c
1604  */
1605
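/* Receive entry point for IPv4 TCP segments handed up by the IP layer.
 * Non-PACKET_HOST frames are dropped, the header and checksum are
 * validated, the TCP control block is filled in, and the owning socket is
 * looked up in the established/listening hashes.  The segment is then
 * processed directly via tcp_v4_do_rcv(), queued on the prequeue, or added
 * to the socket backlog if the socket is currently owned by user context;
 * TIME_WAIT sockets are handled by tcp_timewait_state_process() instead.
 */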
1606 int tcp_v4_rcv(struct sk_buff *skb)
1607 {
1608         const struct iphdr *iph;
1609         const struct tcphdr *th;
1610         struct sock *sk;
1611         int ret;
1612         struct net *net = dev_net(skb->dev);
1613
1614         if (skb->pkt_type != PACKET_HOST)
1615                 goto discard_it;
1616
1617         /* Count it even if it's bad */
1618         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1619
1620         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1621                 goto discard_it;
1622
1623         th = tcp_hdr(skb);
1624
1625         if (th->doff < sizeof(struct tcphdr) / 4)
1626                 goto bad_packet;
1627         if (!pskb_may_pull(skb, th->doff * 4))
1628                 goto discard_it;
1629
1630         /* An explanation is required here, I think.
1631          * Packet length and doff are validated by header prediction,
1632          * provided the case of th->doff==0 has already been eliminated.
1633          * So, we defer the checks. */
1634         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1635                 goto bad_packet;
1636
1637         th = tcp_hdr(skb);
1638         iph = ip_hdr(skb);
1639         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1640         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1641                                     skb->len - th->doff * 4);
1642         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1643         TCP_SKB_CB(skb)->when    = 0;
1644         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1645         TCP_SKB_CB(skb)->sacked  = 0;
1646
1647         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1648         if (!sk)
1649                 goto no_tcp_socket;
1650
1651 process:
1652         if (sk->sk_state == TCP_TIME_WAIT)
1653                 goto do_time_wait;
1654
1655         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1656                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1657                 goto discard_and_relse;
1658         }
1659
1660         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1661                 goto discard_and_relse;
1662         nf_reset(skb);
1663
1664         if (sk_filter(sk, skb))
1665                 goto discard_and_relse;
1666
1667         skb->dev = NULL;
1668
1669         bh_lock_sock_nested(sk);
1670         ret = 0;
1671         if (!sock_owned_by_user(sk)) {
1672 #ifdef CONFIG_NET_DMA
1673                 struct tcp_sock *tp = tcp_sk(sk);
1674                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1675                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1676                 if (tp->ucopy.dma_chan)
1677                         ret = tcp_v4_do_rcv(sk, skb);
1678                 else
1679 #endif
1680                 {
1681                         if (!tcp_prequeue(sk, skb))
1682                                 ret = tcp_v4_do_rcv(sk, skb);
1683                 }
1684         } else if (unlikely(sk_add_backlog(sk, skb))) {
1685                 bh_unlock_sock(sk);
1686                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1687                 goto discard_and_relse;
1688         }
1689         bh_unlock_sock(sk);
1690
1691         sock_put(sk);
1692
1693         return ret;
1694
1695 no_tcp_socket:
1696         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1697                 goto discard_it;
1698
1699         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1700 bad_packet:
1701                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1702         } else {
1703                 tcp_v4_send_reset(NULL, skb);
1704         }
1705
1706 discard_it:
1707         /* Discard frame. */
1708         kfree_skb(skb);
1709         return 0;
1710
1711 discard_and_relse:
1712         sock_put(sk);
1713         goto discard_it;
1714
1715 do_time_wait:
1716         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1717                 inet_twsk_put(inet_twsk(sk));
1718                 goto discard_it;
1719         }
1720
1721         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1722                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1723                 inet_twsk_put(inet_twsk(sk));
1724                 goto discard_it;
1725         }
1726         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1727         case TCP_TW_SYN: {
1728                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1729                                                         &tcp_hashinfo,
1730                                                         iph->daddr, th->dest,
1731                                                         inet_iif(skb));
1732                 if (sk2) {
1733                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1734                         inet_twsk_put(inet_twsk(sk));
1735                         sk = sk2;
1736                         goto process;
1737                 }
1738                 /* Fall through to ACK */
1739         }
1740         case TCP_TW_ACK:
1741                 tcp_v4_timewait_ack(sk, skb);
1742                 break;
1743         case TCP_TW_RST:
1744                 goto no_tcp_socket;
1745         case TCP_TW_SUCCESS:;
1746         }
1747         goto discard_it;
1748 }
1749
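/* Return the inet_peer entry for this connection's destination.  If the
 * cached route still matches inet_daddr, the peer attached to the route is
 * reused (binding it on demand) and needs no release by the caller;
 * otherwise a fresh lookup is performed and *release_it tells the caller
 * to drop the reference when done.
 */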
1750 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1751 {
1752         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1753         struct inet_sock *inet = inet_sk(sk);
1754         struct inet_peer *peer;
1755
1756         if (!rt ||
1757             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1758                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1759                 *release_it = true;
1760         } else {
1761                 if (!rt->peer)
1762                         rt_bind_peer(rt, inet->inet_daddr, 1);
1763                 peer = rt->peer;
1764                 *release_it = false;
1765         }
1766
1767         return peer;
1768 }
1769 EXPORT_SYMBOL(tcp_v4_get_peer);
1770
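/* TIME_WAIT variant of the above: always performs a fresh peer lookup
 * using the destination address stored in the timewait sock.
 */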
1771 void *tcp_v4_tw_get_peer(struct sock *sk)
1772 {
1773         const struct inet_timewait_sock *tw = inet_twsk(sk);
1774
1775         return inet_getpeer_v4(tw->tw_daddr, 1);
1776 }
1777 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1778
1779 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1780         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1781         .twsk_unique    = tcp_twsk_unique,
1782         .twsk_destructor = tcp_twsk_destructor,
1783         .twsk_getpeer   = tcp_v4_tw_get_peer,
1784 };
1785
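/* Address-family specific hooks used by the generic inet connection socket
 * code for plain IPv4 TCP sockets: transmit, header rebuild, connection
 * request and accept handling, and the socket option entry points.
 */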
1786 const struct inet_connection_sock_af_ops ipv4_specific = {
1787         .queue_xmit        = ip_queue_xmit,
1788         .send_check        = tcp_v4_send_check,
1789         .rebuild_header    = inet_sk_rebuild_header,
1790         .conn_request      = tcp_v4_conn_request,
1791         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1792         .get_peer          = tcp_v4_get_peer,
1793         .net_header_len    = sizeof(struct iphdr),
1794         .setsockopt        = ip_setsockopt,
1795         .getsockopt        = ip_getsockopt,
1796         .addr2sockaddr     = inet_csk_addr2sockaddr,
1797         .sockaddr_len      = sizeof(struct sockaddr_in),
1798         .bind_conflict     = inet_csk_bind_conflict,
1799 #ifdef CONFIG_COMPAT
1800         .compat_setsockopt = compat_ip_setsockopt,
1801         .compat_getsockopt = compat_ip_getsockopt,
1802 #endif
1803 };
1804 EXPORT_SYMBOL(ipv4_specific);
1805
1806 #ifdef CONFIG_TCP_MD5SIG
1807 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1808         .md5_lookup             = tcp_v4_md5_lookup,
1809         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1810         .md5_parse              = tcp_v4_parse_md5_keys,
1811 };
1812 #endif
1813
1814 /* NOTE: A lot of things are set to zero explicitly by the call to
1815  *       sk_alloc(), so they need not be done here.
1816  */
1817 static int tcp_v4_init_sock(struct sock *sk)
1818 {
1819         struct inet_connection_sock *icsk = inet_csk(sk);
1820         struct tcp_sock *tp = tcp_sk(sk);
1821
1822         skb_queue_head_init(&tp->out_of_order_queue);
1823         tcp_init_xmit_timers(sk);
1824         tcp_prequeue_init(tp);
1825
1826         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1827         tp->mdev = TCP_TIMEOUT_INIT;
1828
1829         /* So many TCP implementations out there (incorrectly) count the
1830          * initial SYN frame in their delayed-ACK and congestion control
1831          * algorithms that we must have the following bandaid to talk
1832          * efficiently to them.  -DaveM
1833          */
1834         tp->snd_cwnd = TCP_INIT_CWND;
1835
1836         /* See draft-stevens-tcpca-spec-01 for discussion of the
1837          * initialization of these values.
1838          */
1839         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1840         tp->snd_cwnd_clamp = ~0;
1841         tp->mss_cache = TCP_MSS_DEFAULT;
1842
1843         tp->reordering = sysctl_tcp_reordering;
1844         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1845
1846         sk->sk_state = TCP_CLOSE;
1847
1848         sk->sk_write_space = sk_stream_write_space;
1849         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1850
1851         icsk->icsk_af_ops = &ipv4_specific;
1852         icsk->icsk_sync_mss = tcp_sync_mss;
1853 #ifdef CONFIG_TCP_MD5SIG
1854         tp->af_specific = &tcp_sock_ipv4_specific;
1855 #endif
1856
1857         /* TCP Cookie Transactions */
1858         if (sysctl_tcp_cookie_size > 0) {
1859                 /* Default, cookies without s_data_payload. */
1860                 tp->cookie_values =
1861                         kzalloc(sizeof(*tp->cookie_values),
1862                                 sk->sk_allocation);
1863                 if (tp->cookie_values != NULL)
1864                         kref_init(&tp->cookie_values->kref);
1865         }
1866         /* Presumed zeroed, in order of appearance:
1867          *      cookie_in_always, cookie_out_never,
1868          *      s_data_constant, s_data_in, s_data_out
1869          */
1870         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1871         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1872
1873         local_bh_disable();
1874         sock_update_memcg(sk);
1875         sk_sockets_allocated_inc(sk);
1876         local_bh_enable();
1877
1878         return 0;
1879 }
1880
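/* Final per-socket cleanup when a TCP socket is destroyed: stop the
 * timers, release congestion control state, purge the write, out-of-order
 * and prequeue queues, drop any MD5 keys and the cached sendmsg page,
 * release the bound port and update the memory accounting.
 */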
1881 void tcp_v4_destroy_sock(struct sock *sk)
1882 {
1883         struct tcp_sock *tp = tcp_sk(sk);
1884
1885         tcp_clear_xmit_timers(sk);
1886
1887         tcp_cleanup_congestion_control(sk);
1888
1889         /* Clean up the write buffer. */
1890         tcp_write_queue_purge(sk);
1891
1892         /* Cleans up our, hopefully empty, out_of_order_queue. */
1893         __skb_queue_purge(&tp->out_of_order_queue);
1894
1895 #ifdef CONFIG_TCP_MD5SIG
1896         /* Clean up the MD5 key list, if any */
1897         if (tp->md5sig_info) {
1898                 tcp_clear_md5_list(sk);
1899                 kfree(tp->md5sig_info);
1900                 tp->md5sig_info = NULL;
1901         }
1902 #endif
1903
1904 #ifdef CONFIG_NET_DMA
1905         /* Cleans up our sk_async_wait_queue */
1906         __skb_queue_purge(&sk->sk_async_wait_queue);
1907 #endif
1908
1909         /* Clean up the prequeue; it really must be empty by now. */
1910         __skb_queue_purge(&tp->ucopy.prequeue);
1911
1912         /* Clean up a referenced TCP bind bucket. */
1913         if (inet_csk(sk)->icsk_bind_hash)
1914                 inet_put_port(sk);
1915
1916         /*
1917          * If a sendmsg cached page exists, toss it.
1918          */
1919         if (sk->sk_sndmsg_page) {
1920                 __free_page(sk->sk_sndmsg_page);
1921                 sk->sk_sndmsg_page = NULL;
1922         }
1923
1924         /* TCP Cookie Transactions */
1925         if (tp->cookie_values != NULL) {
1926                 kref_put(&tp->cookie_values->kref,
1927                          tcp_cookie_values_release);
1928                 tp->cookie_values = NULL;
1929         }
1930
1931         sk_sockets_allocated_dec(sk);
1932         sock_release_memcg(sk);
1933 }
1934 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1935
1936 #ifdef CONFIG_PROC_FS
1937 /* Proc filesystem TCP sock list dumping. */
1938
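/* Helpers for walking the TIME_WAIT chain of an ehash bucket.  The chains
 * are "nulls" lists, so the end of a chain is detected with is_a_nulls()
 * rather than a NULL pointer.
 */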
1939 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1940 {
1941         return hlist_nulls_empty(head) ? NULL :
1942                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1943 }
1944
1945 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1946 {
1947         return !is_a_nulls(tw->tw_node.next) ?
1948                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1949 }
1950
1951 /*
1952  * Get the next listener socket following cur.  If cur is NULL, get the first
1953  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1954  * very first socket in the hash table is returned.
1955  */
1956 static void *listening_get_next(struct seq_file *seq, void *cur)
1957 {
1958         struct inet_connection_sock *icsk;
1959         struct hlist_nulls_node *node;
1960         struct sock *sk = cur;
1961         struct inet_listen_hashbucket *ilb;
1962         struct tcp_iter_state *st = seq->private;
1963         struct net *net = seq_file_net(seq);
1964
1965         if (!sk) {
1966                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1967                 spin_lock_bh(&ilb->lock);
1968                 sk = sk_nulls_head(&ilb->head);
1969                 st->offset = 0;
1970                 goto get_sk;
1971         }
1972         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1973         ++st->num;
1974         ++st->offset;
1975
1976         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1977                 struct request_sock *req = cur;
1978
1979                 icsk = inet_csk(st->syn_wait_sk);
1980                 req = req->dl_next;
1981                 while (1) {
1982                         while (req) {
1983                                 if (req->rsk_ops->family == st->family) {
1984                                         cur = req;
1985                                         goto out;
1986                                 }
1987                                 req = req->dl_next;
1988                         }
1989                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1990                                 break;
1991 get_req:
1992                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1993                 }
1994                 sk        = sk_nulls_next(st->syn_wait_sk);
1995                 st->state = TCP_SEQ_STATE_LISTENING;
1996                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1997         } else {
1998                 icsk = inet_csk(sk);
1999                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2000                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2001                         goto start_req;
2002                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2003                 sk = sk_nulls_next(sk);
2004         }
2005 get_sk:
2006         sk_nulls_for_each_from(sk, node) {
2007                 if (!net_eq(sock_net(sk), net))
2008                         continue;
2009                 if (sk->sk_family == st->family) {
2010                         cur = sk;
2011                         goto out;
2012                 }
2013                 icsk = inet_csk(sk);
2014                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2015                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2016 start_req:
2017                         st->uid         = sock_i_uid(sk);
2018                         st->syn_wait_sk = sk;
2019                         st->state       = TCP_SEQ_STATE_OPENREQ;
2020                         st->sbucket     = 0;
2021                         goto get_req;
2022                 }
2023                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024         }
2025         spin_unlock_bh(&ilb->lock);
2026         st->offset = 0;
2027         if (++st->bucket < INET_LHTABLE_SIZE) {
2028                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2029                 spin_lock_bh(&ilb->lock);
2030                 sk = sk_nulls_head(&ilb->head);
2031                 goto get_sk;
2032         }
2033         cur = NULL;
2034 out:
2035         return cur;
2036 }
2037
2038 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2039 {
2040         struct tcp_iter_state *st = seq->private;
2041         void *rc;
2042
2043         st->bucket = 0;
2044         st->offset = 0;
2045         rc = listening_get_next(seq, NULL);
2046
2047         while (rc && *pos) {
2048                 rc = listening_get_next(seq, rc);
2049                 --*pos;
2050         }
2051         return rc;
2052 }
2053
2054 static inline int empty_bucket(struct tcp_iter_state *st)
2055 {
2056         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2057                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2058 }
2059
2060 /*
2061  * Get the first established socket starting from the bucket given in st->bucket.
2062  * If st->bucket is zero, the very first socket in the hash is returned.
2063  */
2064 static void *established_get_first(struct seq_file *seq)
2065 {
2066         struct tcp_iter_state *st = seq->private;
2067         struct net *net = seq_file_net(seq);
2068         void *rc = NULL;
2069
2070         st->offset = 0;
2071         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2072                 struct sock *sk;
2073                 struct hlist_nulls_node *node;
2074                 struct inet_timewait_sock *tw;
2075                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2076
2077                 /* Lockless fast path for the common case of empty buckets */
2078                 if (empty_bucket(st))
2079                         continue;
2080
2081                 spin_lock_bh(lock);
2082                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2083                         if (sk->sk_family != st->family ||
2084                             !net_eq(sock_net(sk), net)) {
2085                                 continue;
2086                         }
2087                         rc = sk;
2088                         goto out;
2089                 }
2090                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2091                 inet_twsk_for_each(tw, node,
2092                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2093                         if (tw->tw_family != st->family ||
2094                             !net_eq(twsk_net(tw), net)) {
2095                                 continue;
2096                         }
2097                         rc = tw;
2098                         goto out;
2099                 }
2100                 spin_unlock_bh(lock);
2101                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2102         }
2103 out:
2104         return rc;
2105 }
2106
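/* Advance the established-hash iterator.  While in TCP_SEQ_STATE_TIME_WAIT
 * the bucket's twchain is walked; once both chains of the current bucket
 * are exhausted the per-bucket lock is dropped and the scan resumes at the
 * next non-empty bucket.
 */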
2107 static void *established_get_next(struct seq_file *seq, void *cur)
2108 {
2109         struct sock *sk = cur;
2110         struct inet_timewait_sock *tw;
2111         struct hlist_nulls_node *node;
2112         struct tcp_iter_state *st = seq->private;
2113         struct net *net = seq_file_net(seq);
2114
2115         ++st->num;
2116         ++st->offset;
2117
2118         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2119                 tw = cur;
2120                 tw = tw_next(tw);
2121 get_tw:
2122                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2123                         tw = tw_next(tw);
2124                 }
2125                 if (tw) {
2126                         cur = tw;
2127                         goto out;
2128                 }
2129                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2130                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2131
2132                 /* Look for the next non-empty bucket */
2133                 st->offset = 0;
2134                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2135                                 empty_bucket(st))
2136                         ;
2137                 if (st->bucket > tcp_hashinfo.ehash_mask)
2138                         return NULL;
2139
2140                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2141                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2142         } else
2143                 sk = sk_nulls_next(sk);
2144
2145         sk_nulls_for_each_from(sk, node) {
2146                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2147                         goto found;
2148         }
2149
2150         st->state = TCP_SEQ_STATE_TIME_WAIT;
2151         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2152         goto get_tw;
2153 found:
2154         cur = sk;
2155 out:
2156         return cur;
2157 }
2158
2159 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2160 {
2161         struct tcp_iter_state *st = seq->private;
2162         void *rc;
2163
2164         st->bucket = 0;
2165         rc = established_get_first(seq);
2166
2167         while (rc && pos) {
2168                 rc = established_get_next(seq, rc);
2169                 --pos;
2170         }
2171         return rc;
2172 }
2173
2174 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2175 {
2176         void *rc;
2177         struct tcp_iter_state *st = seq->private;
2178
2179         st->state = TCP_SEQ_STATE_LISTENING;
2180         rc        = listening_get_idx(seq, &pos);
2181
2182         if (!rc) {
2183                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2184                 rc        = established_get_idx(seq, pos);
2185         }
2186
2187         return rc;
2188 }
2189
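/* Try to resume iteration from where the previous read of the seq_file
 * stopped, using the bucket/offset/state remembered in tcp_iter_state.
 * This avoids rescanning the whole hash table for every chunk of a large
 * read; if it fails, the caller falls back to a full walk from the
 * requested position.
 */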
2190 static void *tcp_seek_last_pos(struct seq_file *seq)
2191 {
2192         struct tcp_iter_state *st = seq->private;
2193         int offset = st->offset;
2194         int orig_num = st->num;
2195         void *rc = NULL;
2196
2197         switch (st->state) {
2198         case TCP_SEQ_STATE_OPENREQ:
2199         case TCP_SEQ_STATE_LISTENING:
2200                 if (st->bucket >= INET_LHTABLE_SIZE)
2201                         break;
2202                 st->state = TCP_SEQ_STATE_LISTENING;
2203                 rc = listening_get_next(seq, NULL);
2204                 while (offset-- && rc)
2205                         rc = listening_get_next(seq, rc);
2206                 if (rc)
2207                         break;
2208                 st->bucket = 0;
2209                 /* Fallthrough */
2210         case TCP_SEQ_STATE_ESTABLISHED:
2211         case TCP_SEQ_STATE_TIME_WAIT:
2212                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2213                 if (st->bucket > tcp_hashinfo.ehash_mask)
2214                         break;
2215                 rc = established_get_first(seq);
2216                 while (offset-- && rc)
2217                         rc = established_get_next(seq, rc);
2218         }
2219
2220         st->num = orig_num;
2221
2222         return rc;
2223 }
2224
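/* seq_file iterator callbacks installed by tcp_proc_register() below.
 * tcp_seq_start() tries to resume from the last saved position,
 * tcp_seq_next() walks listening sockets and their open requests first and
 * then the established/TIME_WAIT hash, and tcp_seq_stop() releases
 * whatever locks the iterator is still holding.
 */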
2225 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2226 {
2227         struct tcp_iter_state *st = seq->private;
2228         void *rc;
2229
2230         if (*pos && *pos == st->last_pos) {
2231                 rc = tcp_seek_last_pos(seq);
2232                 if (rc)
2233                         goto out;
2234         }
2235
2236         st->state = TCP_SEQ_STATE_LISTENING;
2237         st->num = 0;
2238         st->bucket = 0;
2239         st->offset = 0;
2240         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2241
2242 out:
2243         st->last_pos = *pos;
2244         return rc;
2245 }
2246
2247 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2248 {
2249         struct tcp_iter_state *st = seq->private;
2250         void *rc = NULL;
2251
2252         if (v == SEQ_START_TOKEN) {
2253                 rc = tcp_get_idx(seq, 0);
2254                 goto out;
2255         }
2256
2257         switch (st->state) {
2258         case TCP_SEQ_STATE_OPENREQ:
2259         case TCP_SEQ_STATE_LISTENING:
2260                 rc = listening_get_next(seq, v);
2261                 if (!rc) {
2262                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2263                         st->bucket = 0;
2264                         st->offset = 0;
2265                         rc        = established_get_first(seq);
2266                 }
2267                 break;
2268         case TCP_SEQ_STATE_ESTABLISHED:
2269         case TCP_SEQ_STATE_TIME_WAIT:
2270                 rc = established_get_next(seq, v);
2271                 break;
2272         }
2273 out:
2274         ++*pos;
2275         st->last_pos = *pos;
2276         return rc;
2277 }
2278
2279 static void tcp_seq_stop(struct seq_file *seq, void *v)
2280 {
2281         struct tcp_iter_state *st = seq->private;
2282
2283         switch (st->state) {
2284         case TCP_SEQ_STATE_OPENREQ:
2285                 if (v) {
2286                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2287                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2288                 }
2289         case TCP_SEQ_STATE_LISTENING:
2290                 if (v != SEQ_START_TOKEN)
2291                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2292                 break;
2293         case TCP_SEQ_STATE_TIME_WAIT:
2294         case TCP_SEQ_STATE_ESTABLISHED:
2295                 if (v)
2296                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2297                 break;
2298         }
2299 }
2300
2301 int tcp_seq_open(struct inode *inode, struct file *file)
2302 {
2303         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2304         struct tcp_iter_state *s;
2305         int err;
2306
2307         err = seq_open_net(inode, file, &afinfo->seq_ops,
2308                           sizeof(struct tcp_iter_state));
2309         if (err < 0)
2310                 return err;
2311
2312         s = ((struct seq_file *)file->private_data)->private;
2313         s->family               = afinfo->family;
2314         s->last_pos             = 0;
2315         return 0;
2316 }
2317 EXPORT_SYMBOL(tcp_seq_open);
2318
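/* Create the per-namespace /proc/net/<name> entry for the given address
 * family.  The afinfo supplies the show routine and file operations; the
 * shared start/next/stop iterators are filled in here.
 */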
2319 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2320 {
2321         int rc = 0;
2322         struct proc_dir_entry *p;
2323
2324         afinfo->seq_ops.start           = tcp_seq_start;
2325         afinfo->seq_ops.next            = tcp_seq_next;
2326         afinfo->seq_ops.stop            = tcp_seq_stop;
2327
2328         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2329                              afinfo->seq_fops, afinfo);
2330         if (!p)
2331                 rc = -ENOMEM;
2332         return rc;
2333 }
2334 EXPORT_SYMBOL(tcp_proc_register);
2335
2336 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2337 {
2338         proc_net_remove(net, afinfo->name);
2339 }
2340 EXPORT_SYMBOL(tcp_proc_unregister);
2341
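/* The three helpers below each format one line of the /proc output: for an
 * open request, for a full socket and for a TIME_WAIT socket respectively.
 * Addresses and ports are printed in hex and the state as a hex value, in
 * the column order of the header emitted by tcp4_seq_show() (sl,
 * local_address, rem_address, st, tx_queue:rx_queue, tr:tm->when,
 * retrnsmt, uid, timeout, inode), followed by a few extra fields such as
 * the refcount and socket pointer.
 */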
2342 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2343                          struct seq_file *f, int i, int uid, int *len)
2344 {
2345         const struct inet_request_sock *ireq = inet_rsk(req);
2346         int ttd = req->expires - jiffies;
2347
2348         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2349                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2350                 i,
2351                 ireq->loc_addr,
2352                 ntohs(inet_sk(sk)->inet_sport),
2353                 ireq->rmt_addr,
2354                 ntohs(ireq->rmt_port),
2355                 TCP_SYN_RECV,
2356                 0, 0, /* could print option size, but that is af dependent. */
2357                 1,    /* timers active (only the expire timer) */
2358                 jiffies_to_clock_t(ttd),
2359                 req->retrans,
2360                 uid,
2361                 0,  /* non standard timer */
2362                 0, /* open_requests have no inode */
2363                 atomic_read(&sk->sk_refcnt),
2364                 req,
2365                 len);
2366 }
2367
2368 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2369 {
2370         int timer_active;
2371         unsigned long timer_expires;
2372         const struct tcp_sock *tp = tcp_sk(sk);
2373         const struct inet_connection_sock *icsk = inet_csk(sk);
2374         const struct inet_sock *inet = inet_sk(sk);
2375         __be32 dest = inet->inet_daddr;
2376         __be32 src = inet->inet_rcv_saddr;
2377         __u16 destp = ntohs(inet->inet_dport);
2378         __u16 srcp = ntohs(inet->inet_sport);
2379         int rx_queue;
2380
2381         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2382                 timer_active    = 1;
2383                 timer_expires   = icsk->icsk_timeout;
2384         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2385                 timer_active    = 4;
2386                 timer_expires   = icsk->icsk_timeout;
2387         } else if (timer_pending(&sk->sk_timer)) {
2388                 timer_active    = 2;
2389                 timer_expires   = sk->sk_timer.expires;
2390         } else {
2391                 timer_active    = 0;
2392                 timer_expires = jiffies;
2393         }
2394
2395         if (sk->sk_state == TCP_LISTEN)
2396                 rx_queue = sk->sk_ack_backlog;
2397         else
2398                 /*
2399                  * because we don't lock the socket, we might find a transient negative value
2400                  */
2401                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2402
2403         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2404                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2405                 i, src, srcp, dest, destp, sk->sk_state,
2406                 tp->write_seq - tp->snd_una,
2407                 rx_queue,
2408                 timer_active,
2409                 jiffies_to_clock_t(timer_expires - jiffies),
2410                 icsk->icsk_retransmits,
2411                 sock_i_uid(sk),
2412                 icsk->icsk_probes_out,
2413                 sock_i_ino(sk),
2414                 atomic_read(&sk->sk_refcnt), sk,
2415                 jiffies_to_clock_t(icsk->icsk_rto),
2416                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2417                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2418                 tp->snd_cwnd,
2419                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2420                 len);
2421 }
2422
2423 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2424                                struct seq_file *f, int i, int *len)
2425 {
2426         __be32 dest, src;
2427         __u16 destp, srcp;
2428         int ttd = tw->tw_ttd - jiffies;
2429
2430         if (ttd < 0)
2431                 ttd = 0;
2432
2433         dest  = tw->tw_daddr;
2434         src   = tw->tw_rcv_saddr;
2435         destp = ntohs(tw->tw_dport);
2436         srcp  = ntohs(tw->tw_sport);
2437
2438         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2439                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2440                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2441                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2442                 atomic_read(&tw->tw_refcnt), tw, len);
2443 }
2444
2445 #define TMPSZ 150
2446
2447 static int tcp4_seq_show(struct seq_file *seq, void *v)
2448 {
2449         struct tcp_iter_state *st;
2450         int len;
2451
2452         if (v == SEQ_START_TOKEN) {
2453                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2454                            "  sl  local_address rem_address   st tx_queue "
2455                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2456                            "inode");
2457                 goto out;
2458         }
2459         st = seq->private;
2460
2461         switch (st->state) {
2462         case TCP_SEQ_STATE_LISTENING:
2463         case TCP_SEQ_STATE_ESTABLISHED:
2464                 get_tcp4_sock(v, seq, st->num, &len);
2465                 break;
2466         case TCP_SEQ_STATE_OPENREQ:
2467                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2468                 break;
2469         case TCP_SEQ_STATE_TIME_WAIT:
2470                 get_timewait4_sock(v, seq, st->num, &len);
2471                 break;
2472         }
2473         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2474 out:
2475         return 0;
2476 }
2477
2478 static const struct file_operations tcp_afinfo_seq_fops = {
2479         .owner   = THIS_MODULE,
2480         .open    = tcp_seq_open,
2481         .read    = seq_read,
2482         .llseek  = seq_lseek,
2483         .release = seq_release_net
2484 };
2485
2486 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2487         .name           = "tcp",
2488         .family         = AF_INET,
2489         .seq_fops       = &tcp_afinfo_seq_fops,
2490         .seq_ops        = {
2491                 .show           = tcp4_seq_show,
2492         },
2493 };
2494
2495 static int __net_init tcp4_proc_init_net(struct net *net)
2496 {
2497         return tcp_proc_register(net, &tcp4_seq_afinfo);
2498 }
2499
2500 static void __net_exit tcp4_proc_exit_net(struct net *net)
2501 {
2502         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2503 }
2504
2505 static struct pernet_operations tcp4_net_ops = {
2506         .init = tcp4_proc_init_net,
2507         .exit = tcp4_proc_exit_net,
2508 };
2509
2510 int __init tcp4_proc_init(void)
2511 {
2512         return register_pernet_subsys(&tcp4_net_ops);
2513 }
2514
2515 void tcp4_proc_exit(void)
2516 {
2517         unregister_pernet_subsys(&tcp4_net_ops);
2518 }
2519 #endif /* CONFIG_PROC_FS */
2520
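/* GRO receive hook for IPv4 TCP.  With CHECKSUM_COMPLETE the checksum is
 * verified against the pseudo-header here; otherwise the packet is flagged
 * for flushing so it bypasses aggregation.  Valid packets are handed on to
 * the protocol-independent tcp_gro_receive().
 */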
2521 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2522 {
2523         const struct iphdr *iph = skb_gro_network_header(skb);
2524
2525         switch (skb->ip_summed) {
2526         case CHECKSUM_COMPLETE:
2527                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2528                                   skb->csum)) {
2529                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2530                         break;
2531                 }
2532
2533                 /* fall through */
2534         case CHECKSUM_NONE:
2535                 NAPI_GRO_CB(skb)->flush = 1;
2536                 return NULL;
2537         }
2538
2539         return tcp_gro_receive(head, skb);
2540 }
2541
2542 int tcp4_gro_complete(struct sk_buff *skb)
2543 {
2544         const struct iphdr *iph = ip_hdr(skb);
2545         struct tcphdr *th = tcp_hdr(skb);
2546
2547         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2548                                   iph->saddr, iph->daddr, 0);
2549         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2550
2551         return tcp_gro_complete(skb);
2552 }
2553
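/* The proto operations table that plugs IPv4 TCP into the socket layer:
 * connect/close/sendmsg/recvmsg and friends, the per-socket init and
 * destroy hooks defined above, plus the shared hash tables, memory
 * accounting knobs and the request/timewait sock operations.
 */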
2554 struct proto tcp_prot = {
2555         .name                   = "TCP",
2556         .owner                  = THIS_MODULE,
2557         .close                  = tcp_close,
2558         .connect                = tcp_v4_connect,
2559         .disconnect             = tcp_disconnect,
2560         .accept                 = inet_csk_accept,
2561         .ioctl                  = tcp_ioctl,
2562         .init                   = tcp_v4_init_sock,
2563         .destroy                = tcp_v4_destroy_sock,
2564         .shutdown               = tcp_shutdown,
2565         .setsockopt             = tcp_setsockopt,
2566         .getsockopt             = tcp_getsockopt,
2567         .recvmsg                = tcp_recvmsg,
2568         .sendmsg                = tcp_sendmsg,
2569         .sendpage               = tcp_sendpage,
2570         .backlog_rcv            = tcp_v4_do_rcv,
2571         .hash                   = inet_hash,
2572         .unhash                 = inet_unhash,
2573         .get_port               = inet_csk_get_port,
2574         .enter_memory_pressure  = tcp_enter_memory_pressure,
2575         .sockets_allocated      = &tcp_sockets_allocated,
2576         .orphan_count           = &tcp_orphan_count,
2577         .memory_allocated       = &tcp_memory_allocated,
2578         .memory_pressure        = &tcp_memory_pressure,
2579         .sysctl_wmem            = sysctl_tcp_wmem,
2580         .sysctl_rmem            = sysctl_tcp_rmem,
2581         .max_header             = MAX_TCP_HEADER,
2582         .obj_size               = sizeof(struct tcp_sock),
2583         .slab_flags             = SLAB_DESTROY_BY_RCU,
2584         .twsk_prot              = &tcp_timewait_sock_ops,
2585         .rsk_prot               = &tcp_request_sock_ops,
2586         .h.hashinfo             = &tcp_hashinfo,
2587         .no_autobind            = true,
2588 #ifdef CONFIG_COMPAT
2589         .compat_setsockopt      = compat_tcp_setsockopt,
2590         .compat_getsockopt      = compat_tcp_getsockopt,
2591 #endif
2592 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2593         .init_cgroup            = tcp_init_cgroup,
2594         .destroy_cgroup         = tcp_destroy_cgroup,
2595         .proto_cgroup           = tcp_proto_cgroup,
2596 #endif
2597 };
2598 EXPORT_SYMBOL(tcp_prot);
2599
2600 static int __net_init tcp_sk_init(struct net *net)
2601 {
2602         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2603                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2604 }
2605
2606 static void __net_exit tcp_sk_exit(struct net *net)
2607 {
2608         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2609 }
2610
2611 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2612 {
2613         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2614 }
2615
2616 static struct pernet_operations __net_initdata tcp_sk_ops = {
2617        .init       = tcp_sk_init,
2618        .exit       = tcp_sk_exit,
2619        .exit_batch = tcp_sk_exit_batch,
2620 };
2621
2622 void __init tcp_v4_init(void)
2623 {
2624         inet_hashinfo_init(&tcp_hashinfo);
2625         if (register_pernet_subsys(&tcp_sk_ops))
2626                 panic("Failed to create the TCP control socket.\n");
2627 }