Merge branch 'master' of git://1984.lsi.us.es/nf-next
[~shefty/rdma-dev.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
55 /* mutex serializing IPVS sockopts; [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
57
58 /* lock for service table */
59 static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* Current IPVS debug level; statics are zero-initialized by default. */
static int sysctl_ip_vs_debug_level;

/* Return the current IPVS debug level. */
int
ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
70 #endif
71
72
73 /*  Protos */
74 static void __ip_vs_del_service(struct ip_vs_service *svc);
75
76
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80                                      const struct in6_addr *addr)
81 {
82         struct flowi6 fl6 = {
83                 .daddr = *addr,
84         };
85         struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
86         bool is_local;
87
88         is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
89
90         dst_release(dst);
91         return is_local;
92 }
93 #endif
94
95 #ifdef CONFIG_SYSCTL
96 /*
97  *      update_defense_level is called from keventd and from sysctl,
98  *      so it needs to protect itself from softirqs
99  */
100 static void update_defense_level(struct netns_ipvs *ipvs)
101 {
102         struct sysinfo i;
103         static int old_secure_tcp = 0;
104         int availmem;
105         int nomem;
106         int to_change = -1;
107
108         /* we only count free and buffered memory (in pages) */
109         si_meminfo(&i);
110         availmem = i.freeram + i.bufferram;
111         /* however in linux 2.5 the i.bufferram is total page cache size,
112            we need adjust it */
113         /* si_swapinfo(&i); */
114         /* availmem = availmem - (i.totalswap - i.freeswap); */
115
116         nomem = (availmem < ipvs->sysctl_amemthresh);
117
118         local_bh_disable();
119
120         /* drop_entry */
121         spin_lock(&ipvs->dropentry_lock);
122         switch (ipvs->sysctl_drop_entry) {
123         case 0:
124                 atomic_set(&ipvs->dropentry, 0);
125                 break;
126         case 1:
127                 if (nomem) {
128                         atomic_set(&ipvs->dropentry, 1);
129                         ipvs->sysctl_drop_entry = 2;
130                 } else {
131                         atomic_set(&ipvs->dropentry, 0);
132                 }
133                 break;
134         case 2:
135                 if (nomem) {
136                         atomic_set(&ipvs->dropentry, 1);
137                 } else {
138                         atomic_set(&ipvs->dropentry, 0);
139                         ipvs->sysctl_drop_entry = 1;
140                 };
141                 break;
142         case 3:
143                 atomic_set(&ipvs->dropentry, 1);
144                 break;
145         }
146         spin_unlock(&ipvs->dropentry_lock);
147
148         /* drop_packet */
149         spin_lock(&ipvs->droppacket_lock);
150         switch (ipvs->sysctl_drop_packet) {
151         case 0:
152                 ipvs->drop_rate = 0;
153                 break;
154         case 1:
155                 if (nomem) {
156                         ipvs->drop_rate = ipvs->drop_counter
157                                 = ipvs->sysctl_amemthresh /
158                                 (ipvs->sysctl_amemthresh-availmem);
159                         ipvs->sysctl_drop_packet = 2;
160                 } else {
161                         ipvs->drop_rate = 0;
162                 }
163                 break;
164         case 2:
165                 if (nomem) {
166                         ipvs->drop_rate = ipvs->drop_counter
167                                 = ipvs->sysctl_amemthresh /
168                                 (ipvs->sysctl_amemthresh-availmem);
169                 } else {
170                         ipvs->drop_rate = 0;
171                         ipvs->sysctl_drop_packet = 1;
172                 }
173                 break;
174         case 3:
175                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
176                 break;
177         }
178         spin_unlock(&ipvs->droppacket_lock);
179
180         /* secure_tcp */
181         spin_lock(&ipvs->securetcp_lock);
182         switch (ipvs->sysctl_secure_tcp) {
183         case 0:
184                 if (old_secure_tcp >= 2)
185                         to_change = 0;
186                 break;
187         case 1:
188                 if (nomem) {
189                         if (old_secure_tcp < 2)
190                                 to_change = 1;
191                         ipvs->sysctl_secure_tcp = 2;
192                 } else {
193                         if (old_secure_tcp >= 2)
194                                 to_change = 0;
195                 }
196                 break;
197         case 2:
198                 if (nomem) {
199                         if (old_secure_tcp < 2)
200                                 to_change = 1;
201                 } else {
202                         if (old_secure_tcp >= 2)
203                                 to_change = 0;
204                         ipvs->sysctl_secure_tcp = 1;
205                 }
206                 break;
207         case 3:
208                 if (old_secure_tcp < 2)
209                         to_change = 1;
210                 break;
211         }
212         old_secure_tcp = ipvs->sysctl_secure_tcp;
213         if (to_change >= 0)
214                 ip_vs_protocol_timeout_change(ipvs,
215                                               ipvs->sysctl_secure_tcp > 1);
216         spin_unlock(&ipvs->securetcp_lock);
217
218         local_bh_enable();
219 }
220
221
222 /*
223  *      Timer for checking the defense
224  */
225 #define DEFENSE_TIMER_PERIOD    1*HZ
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         struct netns_ipvs *ipvs =
230                 container_of(work, struct netns_ipvs, defense_work.work);
231
232         update_defense_level(ipvs);
233         if (atomic_read(&ipvs->dropentry))
234                 ip_vs_random_dropentry(ipvs->net);
235         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
236 }
237 #endif
238
239 int
240 ip_vs_use_count_inc(void)
241 {
242         return try_module_get(THIS_MODULE);
243 }
244
245 void
246 ip_vs_use_count_dec(void)
247 {
248         module_put(THIS_MODULE);
249 }
250
251
252 /*
253  *      Hash table: for virtual service lookups
254  */
255 #define IP_VS_SVC_TAB_BITS 8
256 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
257 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
258
259 /* the service table hashed by <protocol, addr, port> */
260 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
261 /* the service table hashed by fwmark */
262 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
263
264
265 /*
266  *      Returns hash value for virtual service
267  */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270                   const union nf_inet_addr *addr, __be16 port)
271 {
272         register unsigned int porth = ntohs(port);
273         __be32 addr_fold = addr->ip;
274         __u32 ahash;
275
276 #ifdef CONFIG_IP_VS_IPV6
277         if (af == AF_INET6)
278                 addr_fold = addr->ip6[0]^addr->ip6[1]^
279                             addr->ip6[2]^addr->ip6[3];
280 #endif
281         ahash = ntohl(addr_fold);
282         ahash ^= ((size_t) net >> 8);
283
284         return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
285                IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Returns hash value of fwmark for virtual service lookup
290  */
291 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
292 {
293         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
294 }
295
296 /*
297  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
298  *      or in the ip_vs_svc_fwm_table by fwmark.
299  *      Should be called with locked tables.
300  */
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
302 {
303         unsigned int hash;
304
305         if (svc->flags & IP_VS_SVC_F_HASHED) {
306                 pr_err("%s(): request for already hashed, called from %pF\n",
307                        __func__, __builtin_return_address(0));
308                 return 0;
309         }
310
311         if (svc->fwmark == 0) {
312                 /*
313                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
314                  */
315                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
316                                          &svc->addr, svc->port);
317                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
318         } else {
319                 /*
320                  *  Hash it by fwmark in svc_fwm_table
321                  */
322                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
323                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
324         }
325
326         svc->flags |= IP_VS_SVC_F_HASHED;
327         /* increase its refcnt because it is referenced by the svc table */
328         atomic_inc(&svc->refcnt);
329         return 1;
330 }
331
332
333 /*
334  *      Unhashes a service from svc_table / svc_fwm_table.
335  *      Should be called with locked tables.
336  */
337 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
338 {
339         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
340                 pr_err("%s(): request for unhash flagged, called from %pF\n",
341                        __func__, __builtin_return_address(0));
342                 return 0;
343         }
344
345         if (svc->fwmark == 0) {
346                 /* Remove it from the svc_table table */
347                 list_del(&svc->s_list);
348         } else {
349                 /* Remove it from the svc_fwm_table table */
350                 list_del(&svc->f_list);
351         }
352
353         svc->flags &= ~IP_VS_SVC_F_HASHED;
354         atomic_dec(&svc->refcnt);
355         return 1;
356 }
357
358
359 /*
360  *      Get service by {netns, proto,addr,port} in the service table.
361  */
362 static inline struct ip_vs_service *
363 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
364                      const union nf_inet_addr *vaddr, __be16 vport)
365 {
366         unsigned int hash;
367         struct ip_vs_service *svc;
368
369         /* Check for "full" addressed entries */
370         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
371
372         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
373                 if ((svc->af == af)
374                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
375                     && (svc->port == vport)
376                     && (svc->protocol == protocol)
377                     && net_eq(svc->net, net)) {
378                         /* HIT */
379                         return svc;
380                 }
381         }
382
383         return NULL;
384 }
385
386
387 /*
388  *      Get service by {fwmark} in the service table.
389  */
390 static inline struct ip_vs_service *
391 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
392 {
393         unsigned int hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark && svc->af == af
401                     && net_eq(svc->net, net)) {
402                         /* HIT */
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
412                   const union nf_inet_addr *vaddr, __be16 vport)
413 {
414         struct ip_vs_service *svc;
415         struct netns_ipvs *ipvs = net_ipvs(net);
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark) {
423                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
424                 if (svc)
425                         goto out;
426         }
427
428         /*
429          *      Check the table hashed by <protocol,addr,port>
430          *      for "full" addressed entries
431          */
432         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
433
434         if (svc == NULL
435             && protocol == IPPROTO_TCP
436             && atomic_read(&ipvs->ftpsvc_counter)
437             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
438                 /*
439                  * Check if ftp service entry exists, the packet
440                  * might belong to FTP data connections.
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
443         }
444
445         if (svc == NULL
446             && atomic_read(&ipvs->nullsvc_counter)) {
447                 /*
448                  * Check if the catch-all port (port zero) exists
449                  */
450                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
451         }
452
453   out:
454         if (svc)
455                 atomic_inc(&svc->usecnt);
456         read_unlock(&__ip_vs_svc_lock);
457
458         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
459                       fwmark, ip_vs_proto_name(protocol),
460                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
461                       svc ? "hit" : "not hit");
462
463         return svc;
464 }
465
466
467 static inline void
468 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
469 {
470         atomic_inc(&svc->refcnt);
471         dest->svc = svc;
472 }
473
474 static void
475 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
476 {
477         struct ip_vs_service *svc = dest->svc;
478
479         dest->svc = NULL;
480         if (atomic_dec_and_test(&svc->refcnt)) {
481                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
482                               svc->fwmark,
483                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
484                               ntohs(svc->port), atomic_read(&svc->usecnt));
485                 free_percpu(svc->stats.cpustats);
486                 kfree(svc);
487         }
488 }
489
490
491 /*
492  *      Returns hash value for real service
493  */
494 static inline unsigned int ip_vs_rs_hashkey(int af,
495                                             const union nf_inet_addr *addr,
496                                             __be16 port)
497 {
498         register unsigned int porth = ntohs(port);
499         __be32 addr_fold = addr->ip;
500
501 #ifdef CONFIG_IP_VS_IPV6
502         if (af == AF_INET6)
503                 addr_fold = addr->ip6[0]^addr->ip6[1]^
504                             addr->ip6[2]^addr->ip6[3];
505 #endif
506
507         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
508                 & IP_VS_RTAB_MASK;
509 }
510
511 /*
512  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
513  *      should be called with locked tables.
514  */
515 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
516 {
517         unsigned int hash;
518
519         if (!list_empty(&dest->d_list)) {
520                 return 0;
521         }
522
523         /*
524          *      Hash by proto,addr,port,
525          *      which are the parameters of the real service.
526          */
527         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
528
529         list_add(&dest->d_list, &ipvs->rs_table[hash]);
530
531         return 1;
532 }
533
534 /*
535  *      UNhashes ip_vs_dest from rs_table.
536  *      should be called with locked tables.
537  */
538 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
539 {
540         /*
541          * Remove it from the rs_table table.
542          */
543         if (!list_empty(&dest->d_list)) {
544                 list_del_init(&dest->d_list);
545         }
546
547         return 1;
548 }
549
550 /*
551  *      Lookup real service by <proto,addr,port> in the real service table.
552  */
553 struct ip_vs_dest *
554 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
555                           const union nf_inet_addr *daddr,
556                           __be16 dport)
557 {
558         struct netns_ipvs *ipvs = net_ipvs(net);
559         unsigned int hash;
560         struct ip_vs_dest *dest;
561
562         /*
563          *      Check for "full" addressed entries
564          *      Return the first found entry
565          */
566         hash = ip_vs_rs_hashkey(af, daddr, dport);
567
568         read_lock(&ipvs->rs_lock);
569         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
570                 if ((dest->af == af)
571                     && ip_vs_addr_equal(af, &dest->addr, daddr)
572                     && (dest->port == dport)
573                     && ((dest->protocol == protocol) ||
574                         dest->vfwmark)) {
575                         /* HIT */
576                         read_unlock(&ipvs->rs_lock);
577                         return dest;
578                 }
579         }
580         read_unlock(&ipvs->rs_lock);
581
582         return NULL;
583 }
584
585 /*
586  *      Lookup destination by {addr,port} in the given service
587  */
588 static struct ip_vs_dest *
589 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
590                   __be16 dport)
591 {
592         struct ip_vs_dest *dest;
593
594         /*
595          * Find the destination for the given service
596          */
597         list_for_each_entry(dest, &svc->destinations, n_list) {
598                 if ((dest->af == svc->af)
599                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
600                     && (dest->port == dport)) {
601                         /* HIT */
602                         return dest;
603                 }
604         }
605
606         return NULL;
607 }
608
609 /*
610  * Find destination by {daddr,dport,vaddr,protocol}
611  * Cretaed to be used in ip_vs_process_message() in
612  * the backup synchronization daemon. It finds the
613  * destination to be bound to the received connection
614  * on the backup.
615  *
616  * ip_vs_lookup_real_service() looked promissing, but
617  * seems not working as expected.
618  */
619 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
620                                    const union nf_inet_addr *daddr,
621                                    __be16 dport,
622                                    const union nf_inet_addr *vaddr,
623                                    __be16 vport, __u16 protocol, __u32 fwmark,
624                                    __u32 flags)
625 {
626         struct ip_vs_dest *dest;
627         struct ip_vs_service *svc;
628         __be16 port = dport;
629
630         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
631         if (!svc)
632                 return NULL;
633         if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
634                 port = 0;
635         dest = ip_vs_lookup_dest(svc, daddr, port);
636         if (!dest)
637                 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
638         if (dest)
639                 atomic_inc(&dest->refcnt);
640         ip_vs_service_put(svc);
641         return dest;
642 }
643
644 /*
645  *  Lookup dest by {svc,addr,port} in the destination trash.
646  *  The destination trash is used to hold the destinations that are removed
647  *  from the service table but are still referenced by some conn entries.
648  *  The reason to add the destination trash is when the dest is temporary
649  *  down (either by administrator or by monitor program), the dest can be
650  *  picked back from the trash, the remaining connections to the dest can
651  *  continue, and the counting information of the dest is also useful for
652  *  scheduling.
653  */
654 static struct ip_vs_dest *
655 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
656                      __be16 dport)
657 {
658         struct ip_vs_dest *dest, *nxt;
659         struct netns_ipvs *ipvs = net_ipvs(svc->net);
660
661         /*
662          * Find the destination in trash
663          */
664         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
665                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
666                               "dest->refcnt=%d\n",
667                               dest->vfwmark,
668                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
669                               ntohs(dest->port),
670                               atomic_read(&dest->refcnt));
671                 if (dest->af == svc->af &&
672                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
673                     dest->port == dport &&
674                     dest->vfwmark == svc->fwmark &&
675                     dest->protocol == svc->protocol &&
676                     (svc->fwmark ||
677                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
678                       dest->vport == svc->port))) {
679                         /* HIT */
680                         return dest;
681                 }
682
683                 /*
684                  * Try to purge the destination from trash if not referenced
685                  */
686                 if (atomic_read(&dest->refcnt) == 1) {
687                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
688                                       "from trash\n",
689                                       dest->vfwmark,
690                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
691                                       ntohs(dest->port));
692                         list_del(&dest->n_list);
693                         ip_vs_dst_reset(dest);
694                         __ip_vs_unbind_svc(dest);
695                         free_percpu(dest->stats.cpustats);
696                         kfree(dest);
697                 }
698         }
699
700         return NULL;
701 }
702
703
704 /*
705  *  Clean up all the destinations in the trash
706  *  Called by the ip_vs_control_cleanup()
707  *
708  *  When the ip_vs_control_clearup is activated by ipvs module exit,
709  *  the service tables must have been flushed and all the connections
710  *  are expired, and the refcnt of each destination in the trash must
711  *  be 1, so we simply release them here.
712  */
713 static void ip_vs_trash_cleanup(struct net *net)
714 {
715         struct ip_vs_dest *dest, *nxt;
716         struct netns_ipvs *ipvs = net_ipvs(net);
717
718         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
719                 list_del(&dest->n_list);
720                 ip_vs_dst_reset(dest);
721                 __ip_vs_unbind_svc(dest);
722                 free_percpu(dest->stats.cpustats);
723                 kfree(dest);
724         }
725 }
726
727 static void
728 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
729 {
730 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
731
732         spin_lock_bh(&src->lock);
733
734         IP_VS_SHOW_STATS_COUNTER(conns);
735         IP_VS_SHOW_STATS_COUNTER(inpkts);
736         IP_VS_SHOW_STATS_COUNTER(outpkts);
737         IP_VS_SHOW_STATS_COUNTER(inbytes);
738         IP_VS_SHOW_STATS_COUNTER(outbytes);
739
740         ip_vs_read_estimator(dst, src);
741
742         spin_unlock_bh(&src->lock);
743 }
744
745 static void
746 ip_vs_zero_stats(struct ip_vs_stats *stats)
747 {
748         spin_lock_bh(&stats->lock);
749
750         /* get current counters as zero point, rates are zeroed */
751
752 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
753
754         IP_VS_ZERO_STATS_COUNTER(conns);
755         IP_VS_ZERO_STATS_COUNTER(inpkts);
756         IP_VS_ZERO_STATS_COUNTER(outpkts);
757         IP_VS_ZERO_STATS_COUNTER(inbytes);
758         IP_VS_ZERO_STATS_COUNTER(outbytes);
759
760         ip_vs_zero_estimator(stats);
761
762         spin_unlock_bh(&stats->lock);
763 }
764
765 /*
766  *      Update a destination in the given service
767  */
768 static void
769 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
770                     struct ip_vs_dest_user_kern *udest, int add)
771 {
772         struct netns_ipvs *ipvs = net_ipvs(svc->net);
773         int conn_flags;
774
775         /* set the weight and the flags */
776         atomic_set(&dest->weight, udest->weight);
777         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
778         conn_flags |= IP_VS_CONN_F_INACTIVE;
779
780         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
781         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
782                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
783         } else {
784                 /*
785                  *    Put the real service in rs_table if not present.
786                  *    For now only for NAT!
787                  */
788                 write_lock_bh(&ipvs->rs_lock);
789                 ip_vs_rs_hash(ipvs, dest);
790                 write_unlock_bh(&ipvs->rs_lock);
791         }
792         atomic_set(&dest->conn_flags, conn_flags);
793
794         /* bind the service */
795         if (!dest->svc) {
796                 __ip_vs_bind_svc(dest, svc);
797         } else {
798                 if (dest->svc != svc) {
799                         __ip_vs_unbind_svc(dest);
800                         ip_vs_zero_stats(&dest->stats);
801                         __ip_vs_bind_svc(dest, svc);
802                 }
803         }
804
805         /* set the dest status flags */
806         dest->flags |= IP_VS_DEST_F_AVAILABLE;
807
808         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
809                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
810         dest->u_threshold = udest->u_threshold;
811         dest->l_threshold = udest->l_threshold;
812
813         spin_lock_bh(&dest->dst_lock);
814         ip_vs_dst_reset(dest);
815         spin_unlock_bh(&dest->dst_lock);
816
817         if (add)
818                 ip_vs_start_estimator(svc->net, &dest->stats);
819
820         write_lock_bh(&__ip_vs_svc_lock);
821
822         /* Wait until all other svc users go away */
823         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
824
825         if (add) {
826                 list_add(&dest->n_list, &svc->destinations);
827                 svc->num_dests++;
828         }
829
830         /* call the update_service, because server weight may be changed */
831         if (svc->scheduler->update_service)
832                 svc->scheduler->update_service(svc);
833
834         write_unlock_bh(&__ip_vs_svc_lock);
835 }
836
837
838 /*
839  *      Create a destination for the given service
840  */
841 static int
842 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
843                struct ip_vs_dest **dest_p)
844 {
845         struct ip_vs_dest *dest;
846         unsigned int atype;
847
848         EnterFunction(2);
849
850 #ifdef CONFIG_IP_VS_IPV6
851         if (svc->af == AF_INET6) {
852                 atype = ipv6_addr_type(&udest->addr.in6);
853                 if ((!(atype & IPV6_ADDR_UNICAST) ||
854                         atype & IPV6_ADDR_LINKLOCAL) &&
855                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
856                         return -EINVAL;
857         } else
858 #endif
859         {
860                 atype = inet_addr_type(svc->net, udest->addr.ip);
861                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
862                         return -EINVAL;
863         }
864
865         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
866         if (dest == NULL)
867                 return -ENOMEM;
868
869         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
870         if (!dest->stats.cpustats)
871                 goto err_alloc;
872
873         dest->af = svc->af;
874         dest->protocol = svc->protocol;
875         dest->vaddr = svc->addr;
876         dest->vport = svc->port;
877         dest->vfwmark = svc->fwmark;
878         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
879         dest->port = udest->port;
880
881         atomic_set(&dest->activeconns, 0);
882         atomic_set(&dest->inactconns, 0);
883         atomic_set(&dest->persistconns, 0);
884         atomic_set(&dest->refcnt, 1);
885
886         INIT_LIST_HEAD(&dest->d_list);
887         spin_lock_init(&dest->dst_lock);
888         spin_lock_init(&dest->stats.lock);
889         __ip_vs_update_dest(svc, dest, udest, 1);
890
891         *dest_p = dest;
892
893         LeaveFunction(2);
894         return 0;
895
896 err_alloc:
897         kfree(dest);
898         return -ENOMEM;
899 }
900
901
902 /*
903  *      Add a destination into an existing service
904  */
905 static int
906 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
907 {
908         struct ip_vs_dest *dest;
909         union nf_inet_addr daddr;
910         __be16 dport = udest->port;
911         int ret;
912
913         EnterFunction(2);
914
915         if (udest->weight < 0) {
916                 pr_err("%s(): server weight less than zero\n", __func__);
917                 return -ERANGE;
918         }
919
920         if (udest->l_threshold > udest->u_threshold) {
921                 pr_err("%s(): lower threshold is higher than upper threshold\n",
922                         __func__);
923                 return -ERANGE;
924         }
925
926         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
927
928         /*
929          * Check if the dest already exists in the list
930          */
931         dest = ip_vs_lookup_dest(svc, &daddr, dport);
932
933         if (dest != NULL) {
934                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
935                 return -EEXIST;
936         }
937
938         /*
939          * Check if the dest already exists in the trash and
940          * is from the same service
941          */
942         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
943
944         if (dest != NULL) {
945                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
946                               "dest->refcnt=%d, service %u/%s:%u\n",
947                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
948                               atomic_read(&dest->refcnt),
949                               dest->vfwmark,
950                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
951                               ntohs(dest->vport));
952
953                 /*
954                  * Get the destination from the trash
955                  */
956                 list_del(&dest->n_list);
957
958                 __ip_vs_update_dest(svc, dest, udest, 1);
959                 ret = 0;
960         } else {
961                 /*
962                  * Allocate and initialize the dest structure
963                  */
964                 ret = ip_vs_new_dest(svc, udest, &dest);
965         }
966         LeaveFunction(2);
967
968         return ret;
969 }
970
971
972 /*
973  *      Edit a destination in the given service
974  */
975 static int
976 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
977 {
978         struct ip_vs_dest *dest;
979         union nf_inet_addr daddr;
980         __be16 dport = udest->port;
981
982         EnterFunction(2);
983
984         if (udest->weight < 0) {
985                 pr_err("%s(): server weight less than zero\n", __func__);
986                 return -ERANGE;
987         }
988
989         if (udest->l_threshold > udest->u_threshold) {
990                 pr_err("%s(): lower threshold is higher than upper threshold\n",
991                         __func__);
992                 return -ERANGE;
993         }
994
995         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
996
997         /*
998          *  Lookup the destination list
999          */
1000         dest = ip_vs_lookup_dest(svc, &daddr, dport);
1001
1002         if (dest == NULL) {
1003                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1004                 return -ENOENT;
1005         }
1006
1007         __ip_vs_update_dest(svc, dest, udest, 0);
1008         LeaveFunction(2);
1009
1010         return 0;
1011 }
1012
1013
1014 /*
1015  *      Delete a destination (must be already unlinked from the service)
1016  */
1017 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1018 {
1019         struct netns_ipvs *ipvs = net_ipvs(net);
1020
1021         ip_vs_stop_estimator(net, &dest->stats);
1022
1023         /*
1024          *  Remove it from the d-linked list with the real services.
1025          */
1026         write_lock_bh(&ipvs->rs_lock);
1027         ip_vs_rs_unhash(dest);
1028         write_unlock_bh(&ipvs->rs_lock);
1029
1030         /*
1031          *  Decrease the refcnt of the dest, and free the dest
1032          *  if nobody refers to it (refcnt=0). Otherwise, throw
1033          *  the destination into the trash.
1034          */
1035         if (atomic_dec_and_test(&dest->refcnt)) {
1036                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1037                               dest->vfwmark,
1038                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1039                               ntohs(dest->port));
1040                 ip_vs_dst_reset(dest);
1041                 /* simply decrease svc->refcnt here, let the caller check
1042                    and release the service if nobody refers to it.
1043                    Only user context can release destination and service,
1044                    and only one user context can update virtual service at a
1045                    time, so the operation here is OK */
1046                 atomic_dec(&dest->svc->refcnt);
1047                 free_percpu(dest->stats.cpustats);
1048                 kfree(dest);
1049         } else {
1050                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1051                               "dest->refcnt=%d\n",
1052                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1053                               ntohs(dest->port),
1054                               atomic_read(&dest->refcnt));
1055                 list_add(&dest->n_list, &ipvs->dest_trash);
1056                 atomic_inc(&dest->refcnt);
1057         }
1058 }
1059
1060
1061 /*
1062  *      Unlink a destination from the given service
1063  */
1064 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1065                                 struct ip_vs_dest *dest,
1066                                 int svcupd)
1067 {
1068         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1069
1070         /*
1071          *  Remove it from the d-linked destination list.
1072          */
1073         list_del(&dest->n_list);
1074         svc->num_dests--;
1075
1076         /*
1077          *  Call the update_service function of its scheduler
1078          */
1079         if (svcupd && svc->scheduler->update_service)
1080                         svc->scheduler->update_service(svc);
1081 }
1082
1083
1084 /*
1085  *      Delete a destination server in the given service
1086  */
1087 static int
1088 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1089 {
1090         struct ip_vs_dest *dest;
1091         __be16 dport = udest->port;
1092
1093         EnterFunction(2);
1094
1095         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1096
1097         if (dest == NULL) {
1098                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1099                 return -ENOENT;
1100         }
1101
1102         write_lock_bh(&__ip_vs_svc_lock);
1103
1104         /*
1105          *      Wait until all other svc users go away.
1106          */
1107         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1108
1109         /*
1110          *      Unlink dest from the service
1111          */
1112         __ip_vs_unlink_dest(svc, dest, 1);
1113
1114         write_unlock_bh(&__ip_vs_svc_lock);
1115
1116         /*
1117          *      Delete the destination
1118          */
1119         __ip_vs_del_dest(svc->net, dest);
1120
1121         LeaveFunction(2);
1122
1123         return 0;
1124 }
1125
1126
1127 /*
1128  *      Add a service into the service hash table
1129  */
1130 static int
1131 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1132                   struct ip_vs_service **svc_p)
1133 {
1134         int ret = 0;
1135         struct ip_vs_scheduler *sched = NULL;
1136         struct ip_vs_pe *pe = NULL;
1137         struct ip_vs_service *svc = NULL;
1138         struct netns_ipvs *ipvs = net_ipvs(net);
1139
1140         /* increase the module use count */
1141         ip_vs_use_count_inc();
1142
1143         /* Lookup the scheduler by 'u->sched_name' */
1144         sched = ip_vs_scheduler_get(u->sched_name);
1145         if (sched == NULL) {
1146                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1147                 ret = -ENOENT;
1148                 goto out_err;
1149         }
1150
1151         if (u->pe_name && *u->pe_name) {
1152                 pe = ip_vs_pe_getbyname(u->pe_name);
1153                 if (pe == NULL) {
1154                         pr_info("persistence engine module ip_vs_pe_%s "
1155                                 "not found\n", u->pe_name);
1156                         ret = -ENOENT;
1157                         goto out_err;
1158                 }
1159         }
1160
1161 #ifdef CONFIG_IP_VS_IPV6
1162         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1163                 ret = -EINVAL;
1164                 goto out_err;
1165         }
1166 #endif
1167
1168         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1169         if (svc == NULL) {
1170                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1171                 ret = -ENOMEM;
1172                 goto out_err;
1173         }
1174         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1175         if (!svc->stats.cpustats) {
1176                 ret = -ENOMEM;
1177                 goto out_err;
1178         }
1179
1180         /* I'm the first user of the service */
1181         atomic_set(&svc->usecnt, 0);
1182         atomic_set(&svc->refcnt, 0);
1183
1184         svc->af = u->af;
1185         svc->protocol = u->protocol;
1186         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1187         svc->port = u->port;
1188         svc->fwmark = u->fwmark;
1189         svc->flags = u->flags;
1190         svc->timeout = u->timeout * HZ;
1191         svc->netmask = u->netmask;
1192         svc->net = net;
1193
1194         INIT_LIST_HEAD(&svc->destinations);
1195         rwlock_init(&svc->sched_lock);
1196         spin_lock_init(&svc->stats.lock);
1197
1198         /* Bind the scheduler */
1199         ret = ip_vs_bind_scheduler(svc, sched);
1200         if (ret)
1201                 goto out_err;
1202         sched = NULL;
1203
1204         /* Bind the ct retriever */
1205         ip_vs_bind_pe(svc, pe);
1206         pe = NULL;
1207
1208         /* Update the virtual service counters */
1209         if (svc->port == FTPPORT)
1210                 atomic_inc(&ipvs->ftpsvc_counter);
1211         else if (svc->port == 0)
1212                 atomic_inc(&ipvs->nullsvc_counter);
1213
1214         ip_vs_start_estimator(net, &svc->stats);
1215
1216         /* Count only IPv4 services for old get/setsockopt interface */
1217         if (svc->af == AF_INET)
1218                 ipvs->num_services++;
1219
1220         /* Hash the service into the service table */
1221         write_lock_bh(&__ip_vs_svc_lock);
1222         ip_vs_svc_hash(svc);
1223         write_unlock_bh(&__ip_vs_svc_lock);
1224
1225         *svc_p = svc;
1226         /* Now there is a service - full throttle */
1227         ipvs->enable = 1;
1228         return 0;
1229
1230
1231  out_err:
1232         if (svc != NULL) {
1233                 ip_vs_unbind_scheduler(svc);
1234                 if (svc->inc) {
1235                         local_bh_disable();
1236                         ip_vs_app_inc_put(svc->inc);
1237                         local_bh_enable();
1238                 }
1239                 if (svc->stats.cpustats)
1240                         free_percpu(svc->stats.cpustats);
1241                 kfree(svc);
1242         }
1243         ip_vs_scheduler_put(sched);
1244         ip_vs_pe_put(pe);
1245
1246         /* decrease the module use count */
1247         ip_vs_use_count_dec();
1248
1249         return ret;
1250 }
1251
1252
1253 /*
1254  *      Edit a service and bind it with a new scheduler
1255  */
1256 static int
1257 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1258 {
1259         struct ip_vs_scheduler *sched, *old_sched;
1260         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1261         int ret = 0;
1262
1263         /*
1264          * Lookup the scheduler, by 'u->sched_name'
1265          */
1266         sched = ip_vs_scheduler_get(u->sched_name);
1267         if (sched == NULL) {
1268                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1269                 return -ENOENT;
1270         }
1271         old_sched = sched;
1272
1273         if (u->pe_name && *u->pe_name) {
1274                 pe = ip_vs_pe_getbyname(u->pe_name);
1275                 if (pe == NULL) {
1276                         pr_info("persistence engine module ip_vs_pe_%s "
1277                                 "not found\n", u->pe_name);
1278                         ret = -ENOENT;
1279                         goto out;
1280                 }
1281                 old_pe = pe;
1282         }
1283
1284 #ifdef CONFIG_IP_VS_IPV6
1285         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1286                 ret = -EINVAL;
1287                 goto out;
1288         }
1289 #endif
1290
1291         write_lock_bh(&__ip_vs_svc_lock);
1292
1293         /*
1294          * Wait until all other svc users go away.
1295          */
1296         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1297
1298         /*
1299          * Set the flags and timeout value
1300          */
1301         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1302         svc->timeout = u->timeout * HZ;
1303         svc->netmask = u->netmask;
1304
1305         old_sched = svc->scheduler;
1306         if (sched != old_sched) {
1307                 /*
1308                  * Unbind the old scheduler
1309                  */
1310                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1311                         old_sched = sched;
1312                         goto out_unlock;
1313                 }
1314
1315                 /*
1316                  * Bind the new scheduler
1317                  */
1318                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1319                         /*
1320                          * If ip_vs_bind_scheduler fails, restore the old
1321                          * scheduler.
1322                          * The main reason of failure is out of memory.
1323                          *
1324                          * The question is if the old scheduler can be
1325                          * restored all the time. TODO: if it cannot be
1326                          * restored some time, we must delete the service,
1327                          * otherwise the system may crash.
1328                          */
1329                         ip_vs_bind_scheduler(svc, old_sched);
1330                         old_sched = sched;
1331                         goto out_unlock;
1332                 }
1333         }
1334
1335         old_pe = svc->pe;
1336         if (pe != old_pe) {
1337                 ip_vs_unbind_pe(svc);
1338                 ip_vs_bind_pe(svc, pe);
1339         }
1340
1341 out_unlock:
1342         write_unlock_bh(&__ip_vs_svc_lock);
1343 out:
1344         ip_vs_scheduler_put(old_sched);
1345         ip_vs_pe_put(old_pe);
1346         return ret;
1347 }
1348
1349
1350 /*
1351  *      Delete a service from the service list
1352  *      - The service must be unlinked, unlocked and not referenced!
1353  *      - We are called under _bh lock
1354  */
1355 static void __ip_vs_del_service(struct ip_vs_service *svc)
1356 {
1357         struct ip_vs_dest *dest, *nxt;
1358         struct ip_vs_scheduler *old_sched;
1359         struct ip_vs_pe *old_pe;
1360         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1361
1362         pr_info("%s: enter\n", __func__);
1363
1364         /* Count only IPv4 services for old get/setsockopt interface */
1365         if (svc->af == AF_INET)
1366                 ipvs->num_services--;
1367
1368         ip_vs_stop_estimator(svc->net, &svc->stats);
1369
1370         /* Unbind scheduler */
1371         old_sched = svc->scheduler;
1372         ip_vs_unbind_scheduler(svc);
1373         ip_vs_scheduler_put(old_sched);
1374
1375         /* Unbind persistence engine */
1376         old_pe = svc->pe;
1377         ip_vs_unbind_pe(svc);
1378         ip_vs_pe_put(old_pe);
1379
1380         /* Unbind app inc */
1381         if (svc->inc) {
1382                 ip_vs_app_inc_put(svc->inc);
1383                 svc->inc = NULL;
1384         }
1385
1386         /*
1387          *    Unlink the whole destination list
1388          */
1389         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1390                 __ip_vs_unlink_dest(svc, dest, 0);
1391                 __ip_vs_del_dest(svc->net, dest);
1392         }
1393
1394         /*
1395          *    Update the virtual service counters
1396          */
1397         if (svc->port == FTPPORT)
1398                 atomic_dec(&ipvs->ftpsvc_counter);
1399         else if (svc->port == 0)
1400                 atomic_dec(&ipvs->nullsvc_counter);
1401
1402         /*
1403          *    Free the service if nobody refers to it
1404          */
1405         if (atomic_read(&svc->refcnt) == 0) {
1406                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1407                               svc->fwmark,
1408                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1409                               ntohs(svc->port), atomic_read(&svc->usecnt));
1410                 free_percpu(svc->stats.cpustats);
1411                 kfree(svc);
1412         }
1413
1414         /* decrease the module use count */
1415         ip_vs_use_count_dec();
1416 }
1417
1418 /*
1419  * Unlink a service from list and try to delete it if its refcnt reached 0
1420  */
1421 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1422 {
1423         /*
1424          * Unhash it from the service table
1425          */
1426         write_lock_bh(&__ip_vs_svc_lock);
1427
1428         ip_vs_svc_unhash(svc);
1429
1430         /*
1431          * Wait until all the svc users go away.
1432          */
1433         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1434
1435         __ip_vs_del_service(svc);
1436
1437         write_unlock_bh(&__ip_vs_svc_lock);
1438 }
1439
1440 /*
1441  *      Delete a service from the service list
1442  */
1443 static int ip_vs_del_service(struct ip_vs_service *svc)
1444 {
1445         if (svc == NULL)
1446                 return -EEXIST;
1447         ip_vs_unlink_service(svc);
1448
1449         return 0;
1450 }
1451
1452
1453 /*
1454  *      Flush all the virtual services
1455  */
1456 static int ip_vs_flush(struct net *net)
1457 {
1458         int idx;
1459         struct ip_vs_service *svc, *nxt;
1460
1461         /*
1462          * Flush the service table hashed by <netns,protocol,addr,port>
1463          */
1464         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1465                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1466                                          s_list) {
1467                         if (net_eq(svc->net, net))
1468                                 ip_vs_unlink_service(svc);
1469                 }
1470         }
1471
1472         /*
1473          * Flush the service table hashed by fwmark
1474          */
1475         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1476                 list_for_each_entry_safe(svc, nxt,
1477                                          &ip_vs_svc_fwm_table[idx], f_list) {
1478                         if (net_eq(svc->net, net))
1479                                 ip_vs_unlink_service(svc);
1480                 }
1481         }
1482
1483         return 0;
1484 }
1485
1486 /*
1487  *      Delete service by {netns} in the service table.
1488  *      Called by __ip_vs_cleanup()
1489  */
1490 void ip_vs_service_net_cleanup(struct net *net)
1491 {
1492         EnterFunction(2);
1493         /* Check for "full" addressed entries */
1494         mutex_lock(&__ip_vs_mutex);
1495         ip_vs_flush(net);
1496         mutex_unlock(&__ip_vs_mutex);
1497         LeaveFunction(2);
1498 }
1499 /*
1500  * Release dst hold by dst_cache
1501  */
1502 static inline void
1503 __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
1504 {
1505         spin_lock_bh(&dest->dst_lock);
1506         if (dest->dst_cache && dest->dst_cache->dev == dev) {
1507                 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1508                               dev->name,
1509                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1510                               ntohs(dest->port),
1511                               atomic_read(&dest->refcnt));
1512                 ip_vs_dst_reset(dest);
1513         }
1514         spin_unlock_bh(&dest->dst_lock);
1515
1516 }
1517 /*
1518  * Netdev event receiver
1519  * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
1520  * a device that is "unregister" it must be released.
1521  */
1522 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1523                             void *ptr)
1524 {
1525         struct net_device *dev = ptr;
1526         struct net *net = dev_net(dev);
1527         struct netns_ipvs *ipvs = net_ipvs(net);
1528         struct ip_vs_service *svc;
1529         struct ip_vs_dest *dest;
1530         unsigned int idx;
1531
1532         if (event != NETDEV_UNREGISTER || !ipvs)
1533                 return NOTIFY_DONE;
1534         IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1535         EnterFunction(2);
1536         mutex_lock(&__ip_vs_mutex);
1537         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1538                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1539                         if (net_eq(svc->net, net)) {
1540                                 list_for_each_entry(dest, &svc->destinations,
1541                                                     n_list) {
1542                                         __ip_vs_dev_reset(dest, dev);
1543                                 }
1544                         }
1545                 }
1546
1547                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1548                         if (net_eq(svc->net, net)) {
1549                                 list_for_each_entry(dest, &svc->destinations,
1550                                                     n_list) {
1551                                         __ip_vs_dev_reset(dest, dev);
1552                                 }
1553                         }
1554
1555                 }
1556         }
1557
1558         list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
1559                 __ip_vs_dev_reset(dest, dev);
1560         }
1561         mutex_unlock(&__ip_vs_mutex);
1562         LeaveFunction(2);
1563         return NOTIFY_DONE;
1564 }
1565
1566 /*
1567  *      Zero counters in a service or all services
1568  */
1569 static int ip_vs_zero_service(struct ip_vs_service *svc)
1570 {
1571         struct ip_vs_dest *dest;
1572
1573         write_lock_bh(&__ip_vs_svc_lock);
1574         list_for_each_entry(dest, &svc->destinations, n_list) {
1575                 ip_vs_zero_stats(&dest->stats);
1576         }
1577         ip_vs_zero_stats(&svc->stats);
1578         write_unlock_bh(&__ip_vs_svc_lock);
1579         return 0;
1580 }
1581
1582 static int ip_vs_zero_all(struct net *net)
1583 {
1584         int idx;
1585         struct ip_vs_service *svc;
1586
1587         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1588                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1589                         if (net_eq(svc->net, net))
1590                                 ip_vs_zero_service(svc);
1591                 }
1592         }
1593
1594         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1595                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1596                         if (net_eq(svc->net, net))
1597                                 ip_vs_zero_service(svc);
1598                 }
1599         }
1600
1601         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1602         return 0;
1603 }
1604
1605 #ifdef CONFIG_SYSCTL
1606
/*
 * Lower and upper bound handed to proc_dointvec_minmax through
 * .extra1/.extra2 by the "sync_retries" entry of vs_vars[].
 */
1607 static int zero;
1608 static int three = 3;
1609
1610 static int
1611 proc_do_defense_mode(ctl_table *table, int write,
1612                      void __user *buffer, size_t *lenp, loff_t *ppos)
1613 {
1614         struct net *net = current->nsproxy->net_ns;
1615         int *valp = table->data;
1616         int val = *valp;
1617         int rc;
1618
1619         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1620         if (write && (*valp != val)) {
1621                 if ((*valp < 0) || (*valp > 3)) {
1622                         /* Restore the correct value */
1623                         *valp = val;
1624                 } else {
1625                         update_defense_level(net_ipvs(net));
1626                 }
1627         }
1628         return rc;
1629 }
1630
1631 static int
1632 proc_do_sync_threshold(ctl_table *table, int write,
1633                        void __user *buffer, size_t *lenp, loff_t *ppos)
1634 {
1635         int *valp = table->data;
1636         int val[2];
1637         int rc;
1638
1639         /* backup the value first */
1640         memcpy(val, valp, sizeof(val));
1641
1642         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1643         if (write && (valp[0] < 0 || valp[1] < 0 ||
1644             (valp[0] >= valp[1] && valp[1]))) {
1645                 /* Restore the correct value */
1646                 memcpy(valp, val, sizeof(val));
1647         }
1648         return rc;
1649 }
1650
1651 static int
1652 proc_do_sync_mode(ctl_table *table, int write,
1653                      void __user *buffer, size_t *lenp, loff_t *ppos)
1654 {
1655         int *valp = table->data;
1656         int val = *valp;
1657         int rc;
1658
1659         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1660         if (write && (*valp != val)) {
1661                 if ((*valp < 0) || (*valp > 1)) {
1662                         /* Restore the correct value */
1663                         *valp = val;
1664                 }
1665         }
1666         return rc;
1667 }
1668
1669 static int
1670 proc_do_sync_ports(ctl_table *table, int write,
1671                    void __user *buffer, size_t *lenp, loff_t *ppos)
1672 {
1673         int *valp = table->data;
1674         int val = *valp;
1675         int rc;
1676
1677         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1678         if (write && (*valp != val)) {
1679                 if (*valp < 1 || !is_power_of_2(*valp)) {
1680                         /* Restore the correct value */
1681                         *valp = val;
1682                 }
1683         }
1684         return rc;
1685 }
1686
1687 /*
1688  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1689  *      Do not change order or insert new entries without
1690  *      align with netns init in ip_vs_control_net_init()
1691  */
1692
1693 static struct ctl_table vs_vars[] = {
1694         {
1695                 .procname       = "amemthresh",
1696                 .maxlen         = sizeof(int),
1697                 .mode           = 0644,
1698                 .proc_handler   = proc_dointvec,
1699         },
1700         {
1701                 .procname       = "am_droprate",
1702                 .maxlen         = sizeof(int),
1703                 .mode           = 0644,
1704                 .proc_handler   = proc_dointvec,
1705         },
1706         {
1707                 .procname       = "drop_entry",
1708                 .maxlen         = sizeof(int),
1709                 .mode           = 0644,
1710                 .proc_handler   = proc_do_defense_mode,
1711         },
1712         {
1713                 .procname       = "drop_packet",
1714                 .maxlen         = sizeof(int),
1715                 .mode           = 0644,
1716                 .proc_handler   = proc_do_defense_mode,
1717         },
1718 #ifdef CONFIG_IP_VS_NFCT
1719         {
1720                 .procname       = "conntrack",
1721                 .maxlen         = sizeof(int),
1722                 .mode           = 0644,
1723                 .proc_handler   = &proc_dointvec,
1724         },
1725 #endif
1726         {
1727                 .procname       = "secure_tcp",
1728                 .maxlen         = sizeof(int),
1729                 .mode           = 0644,
1730                 .proc_handler   = proc_do_defense_mode,
1731         },
1732         {
1733                 .procname       = "snat_reroute",
1734                 .maxlen         = sizeof(int),
1735                 .mode           = 0644,
1736                 .proc_handler   = &proc_dointvec,
1737         },
1738         {
1739                 .procname       = "sync_version",
1740                 .maxlen         = sizeof(int),
1741                 .mode           = 0644,
1742                 .proc_handler   = &proc_do_sync_mode,
1743         },
1744         {
1745                 .procname       = "sync_ports",
1746                 .maxlen         = sizeof(int),
1747                 .mode           = 0644,
1748                 .proc_handler   = &proc_do_sync_ports,
1749         },
1750         {
1751                 .procname       = "sync_qlen_max",
1752                 .maxlen         = sizeof(int),
1753                 .mode           = 0644,
1754                 .proc_handler   = proc_dointvec,
1755         },
1756         {
1757                 .procname       = "sync_sock_size",
1758                 .maxlen         = sizeof(int),
1759                 .mode           = 0644,
1760                 .proc_handler   = proc_dointvec,
1761         },
1762         {
1763                 .procname       = "cache_bypass",
1764                 .maxlen         = sizeof(int),
1765                 .mode           = 0644,
1766                 .proc_handler   = proc_dointvec,
1767         },
1768         {
1769                 .procname       = "expire_nodest_conn",
1770                 .maxlen         = sizeof(int),
1771                 .mode           = 0644,
1772                 .proc_handler   = proc_dointvec,
1773         },
1774         {
1775                 .procname       = "expire_quiescent_template",
1776                 .maxlen         = sizeof(int),
1777                 .mode           = 0644,
1778                 .proc_handler   = proc_dointvec,
1779         },
1780         {
1781                 .procname       = "sync_threshold",
1782                 .maxlen         =
1783                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1784                 .mode           = 0644,
1785                 .proc_handler   = proc_do_sync_threshold,
1786         },
1787         {
1788                 .procname       = "sync_refresh_period",
1789                 .maxlen         = sizeof(int),
1790                 .mode           = 0644,
1791                 .proc_handler   = proc_dointvec_jiffies,
1792         },
1793         {
1794                 .procname       = "sync_retries",
1795                 .maxlen         = sizeof(int),
1796                 .mode           = 0644,
1797                 .proc_handler   = proc_dointvec_minmax,
1798                 .extra1         = &zero,
1799                 .extra2         = &three,
1800         },
1801         {
1802                 .procname       = "nat_icmp_send",
1803                 .maxlen         = sizeof(int),
1804                 .mode           = 0644,
1805                 .proc_handler   = proc_dointvec,
1806         },
1807         {
1808                 .procname       = "pmtu_disc",
1809                 .maxlen         = sizeof(int),
1810                 .mode           = 0644,
1811                 .proc_handler   = proc_dointvec,
1812         },
1813         {
1814                 .procname       = "backup_only",
1815                 .maxlen         = sizeof(int),
1816                 .mode           = 0644,
1817                 .proc_handler   = proc_dointvec,
1818         },
1819 #ifdef CONFIG_IP_VS_DEBUG
1820         {
1821                 .procname       = "debug_level",
1822                 .data           = &sysctl_ip_vs_debug_level,
1823                 .maxlen         = sizeof(int),
1824                 .mode           = 0644,
1825                 .proc_handler   = proc_dointvec,
1826         },
1827 #endif
1828 #if 0
1829         {
1830                 .procname       = "timeout_established",
1831                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1832                 .maxlen         = sizeof(int),
1833                 .mode           = 0644,
1834                 .proc_handler   = proc_dointvec_jiffies,
1835         },
1836         {
1837                 .procname       = "timeout_synsent",
1838                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1839                 .maxlen         = sizeof(int),
1840                 .mode           = 0644,
1841                 .proc_handler   = proc_dointvec_jiffies,
1842         },
1843         {
1844                 .procname       = "timeout_synrecv",
1845                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1846                 .maxlen         = sizeof(int),
1847                 .mode           = 0644,
1848                 .proc_handler   = proc_dointvec_jiffies,
1849         },
1850         {
1851                 .procname       = "timeout_finwait",
1852                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1853                 .maxlen         = sizeof(int),
1854                 .mode           = 0644,
1855                 .proc_handler   = proc_dointvec_jiffies,
1856         },
1857         {
1858                 .procname       = "timeout_timewait",
1859                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1860                 .maxlen         = sizeof(int),
1861                 .mode           = 0644,
1862                 .proc_handler   = proc_dointvec_jiffies,
1863         },
1864         {
1865                 .procname       = "timeout_close",
1866                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1867                 .maxlen         = sizeof(int),
1868                 .mode           = 0644,
1869                 .proc_handler   = proc_dointvec_jiffies,
1870         },
1871         {
1872                 .procname       = "timeout_closewait",
1873                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1874                 .maxlen         = sizeof(int),
1875                 .mode           = 0644,
1876                 .proc_handler   = proc_dointvec_jiffies,
1877         },
1878         {
1879                 .procname       = "timeout_lastack",
1880                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1881                 .maxlen         = sizeof(int),
1882                 .mode           = 0644,
1883                 .proc_handler   = proc_dointvec_jiffies,
1884         },
1885         {
1886                 .procname       = "timeout_listen",
1887                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1888                 .maxlen         = sizeof(int),
1889                 .mode           = 0644,
1890                 .proc_handler   = proc_dointvec_jiffies,
1891         },
1892         {
1893                 .procname       = "timeout_synack",
1894                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1895                 .maxlen         = sizeof(int),
1896                 .mode           = 0644,
1897                 .proc_handler   = proc_dointvec_jiffies,
1898         },
1899         {
1900                 .procname       = "timeout_udp",
1901                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1902                 .maxlen         = sizeof(int),
1903                 .mode           = 0644,
1904                 .proc_handler   = proc_dointvec_jiffies,
1905         },
1906         {
1907                 .procname       = "timeout_icmp",
1908                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1909                 .maxlen         = sizeof(int),
1910                 .mode           = 0644,
1911                 .proc_handler   = proc_dointvec_jiffies,
1912         },
1913 #endif
1914         { }
1915 };
1916
1917 #endif
1918
1919 #ifdef CONFIG_PROC_FS
1920
1921 struct ip_vs_iter {
1922         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1923         struct list_head *table;
1924         int bucket;
1925 };
1926
1927 /*
1928  *      Write the contents of the VS rule table to a PROCfs file.
1929  *      (It is kept just for backward compatibility)
1930  */
1931 static inline const char *ip_vs_fwd_name(unsigned int flags)
1932 {
1933         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1934         case IP_VS_CONN_F_LOCALNODE:
1935                 return "Local";
1936         case IP_VS_CONN_F_TUNNEL:
1937                 return "Tunnel";
1938         case IP_VS_CONN_F_DROUTE:
1939                 return "Route";
1940         default:
1941                 return "Masq";
1942         }
1943 }
1944
1945
1946 /* Get the Nth entry in the two lists */
1947 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1948 {
1949         struct net *net = seq_file_net(seq);
1950         struct ip_vs_iter *iter = seq->private;
1951         int idx;
1952         struct ip_vs_service *svc;
1953
1954         /* look in hash by protocol */
1955         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1956                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1957                         if (net_eq(svc->net, net) && pos-- == 0) {
1958                                 iter->table = ip_vs_svc_table;
1959                                 iter->bucket = idx;
1960                                 return svc;
1961                         }
1962                 }
1963         }
1964
1965         /* keep looking in fwmark */
1966         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1967                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1968                         if (net_eq(svc->net, net) && pos-- == 0) {
1969                                 iter->table = ip_vs_svc_fwm_table;
1970                                 iter->bucket = idx;
1971                                 return svc;
1972                         }
1973                 }
1974         }
1975
1976         return NULL;
1977 }
1978
1979 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1980 __acquires(__ip_vs_svc_lock)
1981 {
1982
1983         read_lock_bh(&__ip_vs_svc_lock);
1984         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1985 }
1986
1987
1988 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1989 {
1990         struct list_head *e;
1991         struct ip_vs_iter *iter;
1992         struct ip_vs_service *svc;
1993
1994         ++*pos;
1995         if (v == SEQ_START_TOKEN)
1996                 return ip_vs_info_array(seq,0);
1997
1998         svc = v;
1999         iter = seq->private;
2000
2001         if (iter->table == ip_vs_svc_table) {
2002                 /* next service in table hashed by protocol */
2003                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
2004                         return list_entry(e, struct ip_vs_service, s_list);
2005
2006
2007                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2008                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
2009                                             s_list) {
2010                                 return svc;
2011                         }
2012                 }
2013
2014                 iter->table = ip_vs_svc_fwm_table;
2015                 iter->bucket = -1;
2016                 goto scan_fwmark;
2017         }
2018
2019         /* next service in hashed by fwmark */
2020         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2021                 return list_entry(e, struct ip_vs_service, f_list);
2022
2023  scan_fwmark:
2024         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2025                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
2026                                     f_list)
2027                         return svc;
2028         }
2029
2030         return NULL;
2031 }
2032
2033 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2034 __releases(__ip_vs_svc_lock)
2035 {
2036         read_unlock_bh(&__ip_vs_svc_lock);
2037 }
2038
2039
2040 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2041 {
2042         if (v == SEQ_START_TOKEN) {
2043                 seq_printf(seq,
2044                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
2045                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2046                 seq_puts(seq,
2047                          "Prot LocalAddress:Port Scheduler Flags\n");
2048                 seq_puts(seq,
2049                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2050         } else {
2051                 const struct ip_vs_service *svc = v;
2052                 const struct ip_vs_iter *iter = seq->private;
2053                 const struct ip_vs_dest *dest;
2054
2055                 if (iter->table == ip_vs_svc_table) {
2056 #ifdef CONFIG_IP_VS_IPV6
2057                         if (svc->af == AF_INET6)
2058                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
2059                                            ip_vs_proto_name(svc->protocol),
2060                                            &svc->addr.in6,
2061                                            ntohs(svc->port),
2062                                            svc->scheduler->name);
2063                         else
2064 #endif
2065                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
2066                                            ip_vs_proto_name(svc->protocol),
2067                                            ntohl(svc->addr.ip),
2068                                            ntohs(svc->port),
2069                                            svc->scheduler->name,
2070                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2071                 } else {
2072                         seq_printf(seq, "FWM  %08X %s %s",
2073                                    svc->fwmark, svc->scheduler->name,
2074                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2075                 }
2076
2077                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2078                         seq_printf(seq, "persistent %d %08X\n",
2079                                 svc->timeout,
2080                                 ntohl(svc->netmask));
2081                 else
2082                         seq_putc(seq, '\n');
2083
2084                 list_for_each_entry(dest, &svc->destinations, n_list) {
2085 #ifdef CONFIG_IP_VS_IPV6
2086                         if (dest->af == AF_INET6)
2087                                 seq_printf(seq,
2088                                            "  -> [%pI6]:%04X"
2089                                            "      %-7s %-6d %-10d %-10d\n",
2090                                            &dest->addr.in6,
2091                                            ntohs(dest->port),
2092                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2093                                            atomic_read(&dest->weight),
2094                                            atomic_read(&dest->activeconns),
2095                                            atomic_read(&dest->inactconns));
2096                         else
2097 #endif
2098                                 seq_printf(seq,
2099                                            "  -> %08X:%04X      "
2100                                            "%-7s %-6d %-10d %-10d\n",
2101                                            ntohl(dest->addr.ip),
2102                                            ntohs(dest->port),
2103                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2104                                            atomic_read(&dest->weight),
2105                                            atomic_read(&dest->activeconns),
2106                                            atomic_read(&dest->inactconns));
2107
2108                 }
2109         }
2110         return 0;
2111 }
2112
2113 static const struct seq_operations ip_vs_info_seq_ops = {
2114         .start = ip_vs_info_seq_start,
2115         .next  = ip_vs_info_seq_next,
2116         .stop  = ip_vs_info_seq_stop,
2117         .show  = ip_vs_info_seq_show,
2118 };
2119
2120 static int ip_vs_info_open(struct inode *inode, struct file *file)
2121 {
2122         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2123                         sizeof(struct ip_vs_iter));
2124 }
2125
2126 static const struct file_operations ip_vs_info_fops = {
2127         .owner   = THIS_MODULE,
2128         .open    = ip_vs_info_open,
2129         .read    = seq_read,
2130         .llseek  = seq_lseek,
2131         .release = seq_release_net,
2132 };
2133
2134 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2135 {
2136         struct net *net = seq_file_single_net(seq);
2137         struct ip_vs_stats_user show;
2138
2139 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2140         seq_puts(seq,
2141                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
2142         seq_printf(seq,
2143                    "   Conns  Packets  Packets            Bytes            Bytes\n");
2144
2145         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2146         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2147                    show.inpkts, show.outpkts,
2148                    (unsigned long long) show.inbytes,
2149                    (unsigned long long) show.outbytes);
2150
2151 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2152         seq_puts(seq,
2153                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2154         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2155                         show.cps, show.inpps, show.outpps,
2156                         show.inbps, show.outbps);
2157
2158         return 0;
2159 }
2160
2161 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2162 {
2163         return single_open_net(inode, file, ip_vs_stats_show);
2164 }
2165
2166 static const struct file_operations ip_vs_stats_fops = {
2167         .owner = THIS_MODULE,
2168         .open = ip_vs_stats_seq_open,
2169         .read = seq_read,
2170         .llseek = seq_lseek,
2171         .release = single_release_net,
2172 };
2173
2174 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2175 {
2176         struct net *net = seq_file_single_net(seq);
2177         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2178         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2179         struct ip_vs_stats_user rates;
2180         int i;
2181
2182 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2183         seq_puts(seq,
2184                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2185         seq_printf(seq,
2186                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2187
2188         for_each_possible_cpu(i) {
2189                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2190                 unsigned int start;
2191                 __u64 inbytes, outbytes;
2192
2193                 do {
2194                         start = u64_stats_fetch_begin_bh(&u->syncp);
2195                         inbytes = u->ustats.inbytes;
2196                         outbytes = u->ustats.outbytes;
2197                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2198
2199                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2200                            i, u->ustats.conns, u->ustats.inpkts,
2201                            u->ustats.outpkts, (__u64)inbytes,
2202                            (__u64)outbytes);
2203         }
2204
2205         spin_lock_bh(&tot_stats->lock);
2206
2207         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2208                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2209                    tot_stats->ustats.outpkts,
2210                    (unsigned long long) tot_stats->ustats.inbytes,
2211                    (unsigned long long) tot_stats->ustats.outbytes);
2212
2213         ip_vs_read_estimator(&rates, tot_stats);
2214
2215         spin_unlock_bh(&tot_stats->lock);
2216
2217 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2218         seq_puts(seq,
2219                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2220         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2221                         rates.cps,
2222                         rates.inpps,
2223                         rates.outpps,
2224                         rates.inbps,
2225                         rates.outbps);
2226
2227         return 0;
2228 }
2229
2230 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2231 {
2232         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2233 }
2234
2235 static const struct file_operations ip_vs_stats_percpu_fops = {
2236         .owner = THIS_MODULE,
2237         .open = ip_vs_stats_percpu_seq_open,
2238         .read = seq_read,
2239         .llseek = seq_lseek,
2240         .release = single_release_net,
2241 };
2242 #endif
2243
2244 /*
2245  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2246  */
2247 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2248 {
2249 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2250         struct ip_vs_proto_data *pd;
2251 #endif
2252
2253         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2254                   u->tcp_timeout,
2255                   u->tcp_fin_timeout,
2256                   u->udp_timeout);
2257
2258 #ifdef CONFIG_IP_VS_PROTO_TCP
2259         if (u->tcp_timeout) {
2260                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2261                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2262                         = u->tcp_timeout * HZ;
2263         }
2264
2265         if (u->tcp_fin_timeout) {
2266                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2267                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2268                         = u->tcp_fin_timeout * HZ;
2269         }
2270 #endif
2271
2272 #ifdef CONFIG_IP_VS_PROTO_UDP
2273         if (u->udp_timeout) {
2274                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2275                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2276                         = u->udp_timeout * HZ;
2277         }
2278 #endif
2279         return 0;
2280 }
2281
2282
2283 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2284 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2285 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2286                                  sizeof(struct ip_vs_dest_user))
2287 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2288 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2289 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2290
2291 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2292         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2293         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2294         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2295         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2296         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2297         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2298         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2299         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2300         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2301         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2302         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2303 };
2304
2305 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2306                                   struct ip_vs_service_user *usvc_compat)
2307 {
2308         memset(usvc, 0, sizeof(*usvc));
2309
2310         usvc->af                = AF_INET;
2311         usvc->protocol          = usvc_compat->protocol;
2312         usvc->addr.ip           = usvc_compat->addr;
2313         usvc->port              = usvc_compat->port;
2314         usvc->fwmark            = usvc_compat->fwmark;
2315
2316         /* Deep copy of sched_name is not needed here */
2317         usvc->sched_name        = usvc_compat->sched_name;
2318
2319         usvc->flags             = usvc_compat->flags;
2320         usvc->timeout           = usvc_compat->timeout;
2321         usvc->netmask           = usvc_compat->netmask;
2322 }
2323
2324 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2325                                    struct ip_vs_dest_user *udest_compat)
2326 {
2327         memset(udest, 0, sizeof(*udest));
2328
2329         udest->addr.ip          = udest_compat->addr;
2330         udest->port             = udest_compat->port;
2331         udest->conn_flags       = udest_compat->conn_flags;
2332         udest->weight           = udest_compat->weight;
2333         udest->u_threshold      = udest_compat->u_threshold;
2334         udest->l_threshold      = udest_compat->l_threshold;
2335 }
2336
2337 static int
2338 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2339 {
2340         struct net *net = sock_net(sk);
2341         int ret;
2342         unsigned char arg[MAX_ARG_LEN];
2343         struct ip_vs_service_user *usvc_compat;
2344         struct ip_vs_service_user_kern usvc;
2345         struct ip_vs_service *svc;
2346         struct ip_vs_dest_user *udest_compat;
2347         struct ip_vs_dest_user_kern udest;
2348         struct netns_ipvs *ipvs = net_ipvs(net);
2349
2350         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2351                 return -EPERM;
2352
2353         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2354                 return -EINVAL;
2355         if (len < 0 || len >  MAX_ARG_LEN)
2356                 return -EINVAL;
2357         if (len != set_arglen[SET_CMDID(cmd)]) {
2358                 pr_err("set_ctl: len %u != %u\n",
2359                        len, set_arglen[SET_CMDID(cmd)]);
2360                 return -EINVAL;
2361         }
2362
2363         if (copy_from_user(arg, user, len) != 0)
2364                 return -EFAULT;
2365
2366         /* increase the module use count */
2367         ip_vs_use_count_inc();
2368
2369         /* Handle daemons since they have another lock */
2370         if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2371             cmd == IP_VS_SO_SET_STOPDAEMON) {
2372                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2373
2374                 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2375                         ret = -ERESTARTSYS;
2376                         goto out_dec;
2377                 }
2378                 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2379                         ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2380                                                 dm->syncid);
2381                 else
2382                         ret = stop_sync_thread(net, dm->state);
2383                 mutex_unlock(&ipvs->sync_mutex);
2384                 goto out_dec;
2385         }
2386
2387         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2388                 ret = -ERESTARTSYS;
2389                 goto out_dec;
2390         }
2391
2392         if (cmd == IP_VS_SO_SET_FLUSH) {
2393                 /* Flush the virtual service */
2394                 ret = ip_vs_flush(net);
2395                 goto out_unlock;
2396         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2397                 /* Set timeout values for (tcp tcpfin udp) */
2398                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2399                 goto out_unlock;
2400         }
2401
2402         usvc_compat = (struct ip_vs_service_user *)arg;
2403         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2404
2405         /* We only use the new structs internally, so copy userspace compat
2406          * structs to extended internal versions */
2407         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2408         ip_vs_copy_udest_compat(&udest, udest_compat);
2409
2410         if (cmd == IP_VS_SO_SET_ZERO) {
2411                 /* if no service address is set, zero counters in all */
2412                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2413                         ret = ip_vs_zero_all(net);
2414                         goto out_unlock;
2415                 }
2416         }
2417
2418         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2419         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2420             usvc.protocol != IPPROTO_SCTP) {
2421                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2422                        usvc.protocol, &usvc.addr.ip,
2423                        ntohs(usvc.port), usvc.sched_name);
2424                 ret = -EFAULT;
2425                 goto out_unlock;
2426         }
2427
2428         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2429         if (usvc.fwmark == 0)
2430                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2431                                            &usvc.addr, usvc.port);
2432         else
2433                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2434
2435         if (cmd != IP_VS_SO_SET_ADD
2436             && (svc == NULL || svc->protocol != usvc.protocol)) {
2437                 ret = -ESRCH;
2438                 goto out_unlock;
2439         }
2440
2441         switch (cmd) {
2442         case IP_VS_SO_SET_ADD:
2443                 if (svc != NULL)
2444                         ret = -EEXIST;
2445                 else
2446                         ret = ip_vs_add_service(net, &usvc, &svc);
2447                 break;
2448         case IP_VS_SO_SET_EDIT:
2449                 ret = ip_vs_edit_service(svc, &usvc);
2450                 break;
2451         case IP_VS_SO_SET_DEL:
2452                 ret = ip_vs_del_service(svc);
2453                 if (!ret)
2454                         goto out_unlock;
2455                 break;
2456         case IP_VS_SO_SET_ZERO:
2457                 ret = ip_vs_zero_service(svc);
2458                 break;
2459         case IP_VS_SO_SET_ADDDEST:
2460                 ret = ip_vs_add_dest(svc, &udest);
2461                 break;
2462         case IP_VS_SO_SET_EDITDEST:
2463                 ret = ip_vs_edit_dest(svc, &udest);
2464                 break;
2465         case IP_VS_SO_SET_DELDEST:
2466                 ret = ip_vs_del_dest(svc, &udest);
2467                 break;
2468         default:
2469                 ret = -EINVAL;
2470         }
2471
2472   out_unlock:
2473         mutex_unlock(&__ip_vs_mutex);
2474   out_dec:
2475         /* decrease the module use count */
2476         ip_vs_use_count_dec();
2477
2478         return ret;
2479 }
2480
2481
2482 static void
2483 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2484 {
2485         dst->protocol = src->protocol;
2486         dst->addr = src->addr.ip;
2487         dst->port = src->port;
2488         dst->fwmark = src->fwmark;
2489         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2490         dst->flags = src->flags;
2491         dst->timeout = src->timeout / HZ;
2492         dst->netmask = src->netmask;
2493         dst->num_dests = src->num_dests;
2494         ip_vs_copy_stats(&dst->stats, &src->stats);
2495 }
2496
2497 static inline int
2498 __ip_vs_get_service_entries(struct net *net,
2499                             const struct ip_vs_get_services *get,
2500                             struct ip_vs_get_services __user *uptr)
2501 {
2502         int idx, count=0;
2503         struct ip_vs_service *svc;
2504         struct ip_vs_service_entry entry;
2505         int ret = 0;
2506
2507         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2508                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2509                         /* Only expose IPv4 entries to old interface */
2510                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2511                                 continue;
2512
2513                         if (count >= get->num_services)
2514                                 goto out;
2515                         memset(&entry, 0, sizeof(entry));
2516                         ip_vs_copy_service(&entry, svc);
2517                         if (copy_to_user(&uptr->entrytable[count],
2518                                          &entry, sizeof(entry))) {
2519                                 ret = -EFAULT;
2520                                 goto out;
2521                         }
2522                         count++;
2523                 }
2524         }
2525
2526         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2527                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2528                         /* Only expose IPv4 entries to old interface */
2529                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2530                                 continue;
2531
2532                         if (count >= get->num_services)
2533                                 goto out;
2534                         memset(&entry, 0, sizeof(entry));
2535                         ip_vs_copy_service(&entry, svc);
2536                         if (copy_to_user(&uptr->entrytable[count],
2537                                          &entry, sizeof(entry))) {
2538                                 ret = -EFAULT;
2539                                 goto out;
2540                         }
2541                         count++;
2542                 }
2543         }
2544 out:
2545         return ret;
2546 }
2547
2548 static inline int
2549 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2550                          struct ip_vs_get_dests __user *uptr)
2551 {
2552         struct ip_vs_service *svc;
2553         union nf_inet_addr addr = { .ip = get->addr };
2554         int ret = 0;
2555
2556         if (get->fwmark)
2557                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2558         else
2559                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2560                                            get->port);
2561
2562         if (svc) {
2563                 int count = 0;
2564                 struct ip_vs_dest *dest;
2565                 struct ip_vs_dest_entry entry;
2566
2567                 list_for_each_entry(dest, &svc->destinations, n_list) {
2568                         if (count >= get->num_dests)
2569                                 break;
2570
2571                         entry.addr = dest->addr.ip;
2572                         entry.port = dest->port;
2573                         entry.conn_flags = atomic_read(&dest->conn_flags);
2574                         entry.weight = atomic_read(&dest->weight);
2575                         entry.u_threshold = dest->u_threshold;
2576                         entry.l_threshold = dest->l_threshold;
2577                         entry.activeconns = atomic_read(&dest->activeconns);
2578                         entry.inactconns = atomic_read(&dest->inactconns);
2579                         entry.persistconns = atomic_read(&dest->persistconns);
2580                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2581                         if (copy_to_user(&uptr->entrytable[count],
2582                                          &entry, sizeof(entry))) {
2583                                 ret = -EFAULT;
2584                                 break;
2585                         }
2586                         count++;
2587                 }
2588         } else
2589                 ret = -ESRCH;
2590         return ret;
2591 }
2592
2593 static inline void
2594 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2595 {
2596 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2597         struct ip_vs_proto_data *pd;
2598 #endif
2599
2600         memset(u, 0, sizeof (*u));
2601
2602 #ifdef CONFIG_IP_VS_PROTO_TCP
2603         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2604         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2605         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2606 #endif
2607 #ifdef CONFIG_IP_VS_PROTO_UDP
2608         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2609         u->udp_timeout =
2610                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2611 #endif
2612 }
2613
2614
/*
 * Offset of a sockopt command number from IP_VS_BASE_CTL.  The argument is
 * parenthesised so that an expression argument (e.g. a conditional) is
 * evaluated as a whole before the subtraction.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)

/* Size of the fixed-length request structure of each GET command */
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
/* Two daemon records are returned for IP_VS_SO_GET_DAEMON */
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2622
2623 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2624         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2625         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2626         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2627         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2628         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2629         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2630         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2631 };
2632
2633 static int
2634 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2635 {
2636         unsigned char arg[128];
2637         int ret = 0;
2638         unsigned int copylen;
2639         struct net *net = sock_net(sk);
2640         struct netns_ipvs *ipvs = net_ipvs(net);
2641
2642         BUG_ON(!net);
2643         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2644                 return -EPERM;
2645
2646         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2647                 return -EINVAL;
2648
2649         if (*len < get_arglen[GET_CMDID(cmd)]) {
2650                 pr_err("get_ctl: len %u < %u\n",
2651                        *len, get_arglen[GET_CMDID(cmd)]);
2652                 return -EINVAL;
2653         }
2654
2655         copylen = get_arglen[GET_CMDID(cmd)];
2656         if (copylen > 128)
2657                 return -EINVAL;
2658
2659         if (copy_from_user(arg, user, copylen) != 0)
2660                 return -EFAULT;
2661         /*
2662          * Handle daemons first since it has its own locking
2663          */
2664         if (cmd == IP_VS_SO_GET_DAEMON) {
2665                 struct ip_vs_daemon_user d[2];
2666
2667                 memset(&d, 0, sizeof(d));
2668                 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2669                         return -ERESTARTSYS;
2670
2671                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2672                         d[0].state = IP_VS_STATE_MASTER;
2673                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2674                                 sizeof(d[0].mcast_ifn));
2675                         d[0].syncid = ipvs->master_syncid;
2676                 }
2677                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2678                         d[1].state = IP_VS_STATE_BACKUP;
2679                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2680                                 sizeof(d[1].mcast_ifn));
2681                         d[1].syncid = ipvs->backup_syncid;
2682                 }
2683                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2684                         ret = -EFAULT;
2685                 mutex_unlock(&ipvs->sync_mutex);
2686                 return ret;
2687         }
2688
2689         if (mutex_lock_interruptible(&__ip_vs_mutex))
2690                 return -ERESTARTSYS;
2691
2692         switch (cmd) {
2693         case IP_VS_SO_GET_VERSION:
2694         {
2695                 char buf[64];
2696
2697                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2698                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2699                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2700                         ret = -EFAULT;
2701                         goto out;
2702                 }
2703                 *len = strlen(buf)+1;
2704         }
2705         break;
2706
2707         case IP_VS_SO_GET_INFO:
2708         {
2709                 struct ip_vs_getinfo info;
2710                 info.version = IP_VS_VERSION_CODE;
2711                 info.size = ip_vs_conn_tab_size;
2712                 info.num_services = ipvs->num_services;
2713                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2714                         ret = -EFAULT;
2715         }
2716         break;
2717
2718         case IP_VS_SO_GET_SERVICES:
2719         {
2720                 struct ip_vs_get_services *get;
2721                 int size;
2722
2723                 get = (struct ip_vs_get_services *)arg;
2724                 size = sizeof(*get) +
2725                         sizeof(struct ip_vs_service_entry) * get->num_services;
2726                 if (*len != size) {
2727                         pr_err("length: %u != %u\n", *len, size);
2728                         ret = -EINVAL;
2729                         goto out;
2730                 }
2731                 ret = __ip_vs_get_service_entries(net, get, user);
2732         }
2733         break;
2734
2735         case IP_VS_SO_GET_SERVICE:
2736         {
2737                 struct ip_vs_service_entry *entry;
2738                 struct ip_vs_service *svc;
2739                 union nf_inet_addr addr;
2740
2741                 entry = (struct ip_vs_service_entry *)arg;
2742                 addr.ip = entry->addr;
2743                 if (entry->fwmark)
2744                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2745                 else
2746                         svc = __ip_vs_service_find(net, AF_INET,
2747                                                    entry->protocol, &addr,
2748                                                    entry->port);
2749                 if (svc) {
2750                         ip_vs_copy_service(entry, svc);
2751                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2752                                 ret = -EFAULT;
2753                 } else
2754                         ret = -ESRCH;
2755         }
2756         break;
2757
2758         case IP_VS_SO_GET_DESTS:
2759         {
2760                 struct ip_vs_get_dests *get;
2761                 int size;
2762
2763                 get = (struct ip_vs_get_dests *)arg;
2764                 size = sizeof(*get) +
2765                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2766                 if (*len != size) {
2767                         pr_err("length: %u != %u\n", *len, size);
2768                         ret = -EINVAL;
2769                         goto out;
2770                 }
2771                 ret = __ip_vs_get_dest_entries(net, get, user);
2772         }
2773         break;
2774
2775         case IP_VS_SO_GET_TIMEOUT:
2776         {
2777                 struct ip_vs_timeout_user t;
2778
2779                 __ip_vs_get_timeouts(net, &t);
2780                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2781                         ret = -EFAULT;
2782         }
2783         break;
2784
2785         default:
2786                 ret = -EINVAL;
2787         }
2788
2789 out:
2790         mutex_unlock(&__ip_vs_mutex);
2791         return ret;
2792 }
2793
2794
2795 static struct nf_sockopt_ops ip_vs_sockopts = {
2796         .pf             = PF_INET,
2797         .set_optmin     = IP_VS_BASE_CTL,
2798         .set_optmax     = IP_VS_SO_SET_MAX+1,
2799         .set            = do_ip_vs_set_ctl,
2800         .get_optmin     = IP_VS_BASE_CTL,
2801         .get_optmax     = IP_VS_SO_GET_MAX+1,
2802         .get            = do_ip_vs_get_ctl,
2803         .owner          = THIS_MODULE,
2804 };
2805
2806 /*
2807  * Generic Netlink interface
2808  */
2809
2810 /* IPVS genetlink family */
2811 static struct genl_family ip_vs_genl_family = {
2812         .id             = GENL_ID_GENERATE,
2813         .hdrsize        = 0,
2814         .name           = IPVS_GENL_NAME,
2815         .version        = IPVS_GENL_VERSION,
2816         .maxattr        = IPVS_CMD_MAX,
2817         .netnsok        = true,         /* Make ipvsadm to work on netns */
2818 };
2819
2820 /* Policy used for first-level command attributes */
2821 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2822         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2823         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2824         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2825         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2826         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2827         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2828 };
2829
2830 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2831 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2832         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2833         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2834                                             .len = IP_VS_IFNAME_MAXLEN },
2835         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2836 };
2837
2838 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2839 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2840         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2841         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2842         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2843                                             .len = sizeof(union nf_inet_addr) },
2844         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2845         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2846         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2847                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2848         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2849                                             .len = IP_VS_PENAME_MAXLEN },
2850         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2851                                             .len = sizeof(struct ip_vs_flags) },
2852         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2853         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2854         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2855 };
2856
2857 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2858 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2859         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2860                                             .len = sizeof(union nf_inet_addr) },
2861         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2862         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2863         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2864         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2865         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2866         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2867         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2868         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2869         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2870 };
2871
2872 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2873                                  struct ip_vs_stats *stats)
2874 {
2875         struct ip_vs_stats_user ustats;
2876         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2877         if (!nl_stats)
2878                 return -EMSGSIZE;
2879
2880         ip_vs_copy_stats(&ustats, stats);
2881
2882         if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2883             nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2884             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2885             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2886             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2887             nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2888             nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2889             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2890             nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2891             nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2892                 goto nla_put_failure;
2893         nla_nest_end(skb, nl_stats);
2894
2895         return 0;
2896
2897 nla_put_failure:
2898         nla_nest_cancel(skb, nl_stats);
2899         return -EMSGSIZE;
2900 }
2901
2902 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2903                                    struct ip_vs_service *svc)
2904 {
2905         struct nlattr *nl_service;
2906         struct ip_vs_flags flags = { .flags = svc->flags,
2907                                      .mask = ~0 };
2908
2909         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2910         if (!nl_service)
2911                 return -EMSGSIZE;
2912
2913         if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2914                 goto nla_put_failure;
2915         if (svc->fwmark) {
2916                 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2917                         goto nla_put_failure;
2918         } else {
2919                 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2920                     nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2921                     nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2922                         goto nla_put_failure;
2923         }
2924
2925         if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2926             (svc->pe &&
2927              nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2928             nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2929             nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2930             nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2931                 goto nla_put_failure;
2932         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2933                 goto nla_put_failure;
2934
2935         nla_nest_end(skb, nl_service);
2936
2937         return 0;
2938
2939 nla_put_failure:
2940         nla_nest_cancel(skb, nl_service);
2941         return -EMSGSIZE;
2942 }
2943
2944 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2945                                    struct ip_vs_service *svc,
2946                                    struct netlink_callback *cb)
2947 {
2948         void *hdr;
2949
2950         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2951                           &ip_vs_genl_family, NLM_F_MULTI,
2952                           IPVS_CMD_NEW_SERVICE);
2953         if (!hdr)
2954                 return -EMSGSIZE;
2955
2956         if (ip_vs_genl_fill_service(skb, svc) < 0)
2957                 goto nla_put_failure;
2958
2959         return genlmsg_end(skb, hdr);
2960
2961 nla_put_failure:
2962         genlmsg_cancel(skb, hdr);
2963         return -EMSGSIZE;
2964 }
2965
2966 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2967                                     struct netlink_callback *cb)
2968 {
2969         int idx = 0, i;
2970         int start = cb->args[0];
2971         struct ip_vs_service *svc;
2972         struct net *net = skb_sknet(skb);
2973
2974         mutex_lock(&__ip_vs_mutex);
2975         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2976                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2977                         if (++idx <= start || !net_eq(svc->net, net))
2978                                 continue;
2979                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2980                                 idx--;
2981                                 goto nla_put_failure;
2982                         }
2983                 }
2984         }
2985
2986         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2987                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2988                         if (++idx <= start || !net_eq(svc->net, net))
2989                                 continue;
2990                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2991                                 idx--;
2992                                 goto nla_put_failure;
2993                         }
2994                 }
2995         }
2996
2997 nla_put_failure:
2998         mutex_unlock(&__ip_vs_mutex);
2999         cb->args[0] = idx;
3000
3001         return skb->len;
3002 }
3003
3004 static int ip_vs_genl_parse_service(struct net *net,
3005                                     struct ip_vs_service_user_kern *usvc,
3006                                     struct nlattr *nla, int full_entry,
3007                                     struct ip_vs_service **ret_svc)
3008 {
3009         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3010         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3011         struct ip_vs_service *svc;
3012
3013         /* Parse mandatory identifying service fields first */
3014         if (nla == NULL ||
3015             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3016                 return -EINVAL;
3017
3018         nla_af          = attrs[IPVS_SVC_ATTR_AF];
3019         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
3020         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
3021         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
3022         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
3023
3024         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3025                 return -EINVAL;
3026
3027         memset(usvc, 0, sizeof(*usvc));
3028
3029         usvc->af = nla_get_u16(nla_af);
3030 #ifdef CONFIG_IP_VS_IPV6
3031         if (usvc->af != AF_INET && usvc->af != AF_INET6)
3032 #else
3033         if (usvc->af != AF_INET)
3034 #endif
3035                 return -EAFNOSUPPORT;
3036
3037         if (nla_fwmark) {
3038                 usvc->protocol = IPPROTO_TCP;
3039                 usvc->fwmark = nla_get_u32(nla_fwmark);
3040         } else {
3041                 usvc->protocol = nla_get_u16(nla_protocol);
3042                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3043                 usvc->port = nla_get_u16(nla_port);
3044                 usvc->fwmark = 0;
3045         }
3046
3047         if (usvc->fwmark)
3048                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3049         else
3050                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3051                                            &usvc->addr, usvc->port);
3052         *ret_svc = svc;
3053
3054         /* If a full entry was requested, check for the additional fields */
3055         if (full_entry) {
3056                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3057                               *nla_netmask;
3058                 struct ip_vs_flags flags;
3059
3060                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3061                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3062                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3063                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3064                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3065
3066                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3067                         return -EINVAL;
3068
3069                 nla_memcpy(&flags, nla_flags, sizeof(flags));
3070
3071                 /* prefill flags from service if it already exists */
3072                 if (svc)
3073                         usvc->flags = svc->flags;
3074
3075                 /* set new flags from userland */
3076                 usvc->flags = (usvc->flags & ~flags.mask) |
3077                               (flags.flags & flags.mask);
3078                 usvc->sched_name = nla_data(nla_sched);
3079                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3080                 usvc->timeout = nla_get_u32(nla_timeout);
3081                 usvc->netmask = nla_get_u32(nla_netmask);
3082         }
3083
3084         return 0;
3085 }
3086
3087 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3088                                                      struct nlattr *nla)
3089 {
3090         struct ip_vs_service_user_kern usvc;
3091         struct ip_vs_service *svc;
3092         int ret;
3093
3094         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3095         return ret ? ERR_PTR(ret) : svc;
3096 }
3097
3098 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3099 {
3100         struct nlattr *nl_dest;
3101
3102         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3103         if (!nl_dest)
3104                 return -EMSGSIZE;
3105
3106         if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3107             nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3108             nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3109                         (atomic_read(&dest->conn_flags) &
3110                          IP_VS_CONN_F_FWD_MASK)) ||
3111             nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3112                         atomic_read(&dest->weight)) ||
3113             nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3114             nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3115             nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3116                         atomic_read(&dest->activeconns)) ||
3117             nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3118                         atomic_read(&dest->inactconns)) ||
3119             nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3120                         atomic_read(&dest->persistconns)))
3121                 goto nla_put_failure;
3122         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3123                 goto nla_put_failure;
3124
3125         nla_nest_end(skb, nl_dest);
3126
3127         return 0;
3128
3129 nla_put_failure:
3130         nla_nest_cancel(skb, nl_dest);
3131         return -EMSGSIZE;
3132 }
3133
3134 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3135                                 struct netlink_callback *cb)
3136 {
3137         void *hdr;
3138
3139         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3140                           &ip_vs_genl_family, NLM_F_MULTI,
3141                           IPVS_CMD_NEW_DEST);
3142         if (!hdr)
3143                 return -EMSGSIZE;
3144
3145         if (ip_vs_genl_fill_dest(skb, dest) < 0)
3146                 goto nla_put_failure;
3147
3148         return genlmsg_end(skb, hdr);
3149
3150 nla_put_failure:
3151         genlmsg_cancel(skb, hdr);
3152         return -EMSGSIZE;
3153 }
3154
3155 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3156                                  struct netlink_callback *cb)
3157 {
3158         int idx = 0;
3159         int start = cb->args[0];
3160         struct ip_vs_service *svc;
3161         struct ip_vs_dest *dest;
3162         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3163         struct net *net = skb_sknet(skb);
3164
3165         mutex_lock(&__ip_vs_mutex);
3166
3167         /* Try to find the service for which to dump destinations */
3168         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3169                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3170                 goto out_err;
3171
3172
3173         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3174         if (IS_ERR(svc) || svc == NULL)
3175                 goto out_err;
3176
3177         /* Dump the destinations */
3178         list_for_each_entry(dest, &svc->destinations, n_list) {
3179                 if (++idx <= start)
3180                         continue;
3181                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3182                         idx--;
3183                         goto nla_put_failure;
3184                 }
3185         }
3186
3187 nla_put_failure:
3188         cb->args[0] = idx;
3189
3190 out_err:
3191         mutex_unlock(&__ip_vs_mutex);
3192
3193         return skb->len;
3194 }
3195
3196 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3197                                  struct nlattr *nla, int full_entry)
3198 {
3199         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3200         struct nlattr *nla_addr, *nla_port;
3201
3202         /* Parse mandatory identifying destination fields first */
3203         if (nla == NULL ||
3204             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3205                 return -EINVAL;
3206
3207         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3208         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3209
3210         if (!(nla_addr && nla_port))
3211                 return -EINVAL;
3212
3213         memset(udest, 0, sizeof(*udest));
3214
3215         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3216         udest->port = nla_get_u16(nla_port);
3217
3218         /* If a full entry was requested, check for the additional fields */
3219         if (full_entry) {
3220                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3221                               *nla_l_thresh;
3222
3223                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3224                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3225                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3226                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3227
3228                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3229                         return -EINVAL;
3230
3231                 udest->conn_flags = nla_get_u32(nla_fwd)
3232                                     & IP_VS_CONN_F_FWD_MASK;
3233                 udest->weight = nla_get_u32(nla_weight);
3234                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3235                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3236         }
3237