2 * Routines having to do with the 'struct sk_buff' memory handlers.
4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
5 * Florian La Roche <rzsfl@rz.uni-sb.de>
8 * Alan Cox : Fixed the worst of the load
10 * Dave Platt : Interrupt stacking fix.
11 * Richard Kooijman : Timestamp fixes.
12 * Alan Cox : Changed buffer format.
13 * Alan Cox : destructor hook for AF_UNIX etc.
14 * Linus Torvalds : Better skb_clone.
15 * Alan Cox : Added skb_copy.
16 * Alan Cox : Added all the changed routines Linus
17 * only put in the headers
18 * Ray VanTassle : Fixed --skb->lock in free
19 * Alan Cox : skb_copy copy arp field
20 * Andi Kleen : slabified it.
21 * Robert Olsson : Removed skb_head_pool
24 * The __skb_ routines should be called with interrupts
25 * disabled, or you better be *real* sure that the operation is atomic
26 * with respect to whatever list is being frobbed (e.g. via lock_sock()
27 * or via disabling bottom half handlers, etc).
29 * This program is free software; you can redistribute it and/or
30 * modify it under the terms of the GNU General Public License
31 * as published by the Free Software Foundation; either version
32 * 2 of the License, or (at your option) any later version.
36 * The functions in this file will not compile correctly with gcc 2.4.x
39 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41 #include <linux/module.h>
42 #include <linux/types.h>
43 #include <linux/kernel.h>
44 #include <linux/kmemcheck.h>
46 #include <linux/interrupt.h>
48 #include <linux/inet.h>
49 #include <linux/slab.h>
50 #include <linux/netdevice.h>
51 #ifdef CONFIG_NET_CLS_ACT
52 #include <net/pkt_sched.h>
54 #include <linux/string.h>
55 #include <linux/skbuff.h>
56 #include <linux/splice.h>
57 #include <linux/cache.h>
58 #include <linux/rtnetlink.h>
59 #include <linux/init.h>
60 #include <linux/scatterlist.h>
61 #include <linux/errqueue.h>
62 #include <linux/prefetch.h>
64 #include <net/protocol.h>
67 #include <net/checksum.h>
70 #include <asm/uaccess.h>
71 #include <trace/events/skb.h>
72 #include <linux/highmem.h>
/*
 * Slab caches backing sk_buff allocation in this file.
 *
 * skbuff_head_cache:   plain sk_buff heads; chosen by __alloc_skb() when
 *                      SKB_ALLOC_FCLONE is not set, and always used by
 *                      build_skb(). Not static, so visible outside this file.
 * skbuff_fclone_cache: chosen by __alloc_skb() when SKB_ALLOC_FCLONE is set.
 *                      __alloc_skb() treats each object as the skb, a child
 *                      skb at (skb + 1), and an atomic_t reference count
 *                      right after the child.
 */
74 struct kmem_cache *skbuff_head_cache __read_mostly;
75 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
77 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
78 struct pipe_buffer *buf)
83 static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
84 struct pipe_buffer *buf)
89 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
90 struct pipe_buffer *buf)
96 /* Pipe buffer operations for a socket. */
97 static const struct pipe_buf_operations sock_pipe_buf_ops = {
99 .map = generic_pipe_buf_map,
100 .unmap = generic_pipe_buf_unmap,
101 .confirm = generic_pipe_buf_confirm,
102 .release = sock_pipe_buf_release,
103 .steal = sock_pipe_buf_steal,
104 .get = sock_pipe_buf_get,
108 * skb_panic - private function for out-of-line support
112 * @msg: skb_over_panic or skb_under_panic
114 * Out-of-line support for skb_put() and skb_push().
115 * Called via the wrapper skb_over_panic() or skb_under_panic().
116 * Keep out of line to prevent kernel bloat.
117 * __builtin_return_address is not used because it is not always reliable.
119 static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
122 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
123 msg, addr, skb->len, sz, skb->head, skb->data,
124 (unsigned long)skb->tail, (unsigned long)skb->end,
125 skb->dev ? skb->dev->name : "<NULL>");
129 static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
131 skb_panic(skb, sz, addr, __func__);
134 static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
136 skb_panic(skb, sz, addr, __func__);
140 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
141 * the caller if emergency pfmemalloc reserves are being used. If it is and
142 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
143 * may be used. Otherwise, the packet data may be discarded until enough
/* Convenience wrapper: forwards to __kmalloc_reserve(), passing _RET_IP_ as its 'ip' argument. */
146 #define kmalloc_reserve(size, gfp, node, pfmemalloc) \
147 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
149 static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
150 unsigned long ip, bool *pfmemalloc)
153 bool ret_pfmemalloc = false;
156 * Try a regular allocation, when that fails and we're not entitled
157 * to the reserves, fail.
159 obj = kmalloc_node_track_caller(size,
160 flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
162 if (obj || !(gfp_pfmemalloc_allowed(flags)))
165 /* Try again but now we are using pfmemalloc reserves */
166 ret_pfmemalloc = true;
167 obj = kmalloc_node_track_caller(size, flags, node);
171 *pfmemalloc = ret_pfmemalloc;
176 /* Allocate a new skbuff. We do this ourselves so we can fill in a few
177 * 'private' fields and also do memory statistics to find all the
183 * __alloc_skb - allocate a network buffer
184 * @size: size to allocate
185 * @gfp_mask: allocation mask
186 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
187 * instead of head cache and allocate a cloned (child) skb.
188 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
189 * allocations in case the data is required for writeback
190 * @node: numa node to allocate memory on
192 * Allocate a new &sk_buff. The returned buffer has no headroom and a
193 * tailroom of at least @size bytes. The object has a reference count
194 * of one. The return is the buffer. On a failure the return is %NULL.
196 * Buffers may only be allocated from interrupts using a @gfp_mask of
199 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
202 struct kmem_cache *cache;
203 struct skb_shared_info *shinfo;
208 cache = (flags & SKB_ALLOC_FCLONE)
209 ? skbuff_fclone_cache : skbuff_head_cache;
211 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
212 gfp_mask |= __GFP_MEMALLOC;
215 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
220 /* We do our best to align skb_shared_info on a separate cache
221 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
222 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
223 * Both skb->head and skb_shared_info are cache line aligned.
225 size = SKB_DATA_ALIGN(size);
226 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
227 data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
230 /* kmalloc(size) might give us more room than requested.
231 * Put skb_shared_info exactly at the end of allocated zone,
232 * to allow max possible filling before reallocation.
234 size = SKB_WITH_OVERHEAD(ksize(data));
235 prefetchw(data + size);
238 * Only clear those fields we need to clear, not those that we will
239 * actually initialise below. Hence, don't put any more fields after
240 * the tail pointer in struct sk_buff!
242 memset(skb, 0, offsetof(struct sk_buff, tail));
243 /* Account for allocated memory : skb + skb->head */
244 skb->truesize = SKB_TRUESIZE(size);
245 skb->pfmemalloc = pfmemalloc;
246 atomic_set(&skb->users, 1);
249 skb_reset_tail_pointer(skb);
250 skb->end = skb->tail + size;
251 #ifdef NET_SKBUFF_DATA_USES_OFFSET
252 skb->mac_header = ~0U;
253 skb->transport_header = ~0U;
256 /* make sure we initialize shinfo sequentially */
257 shinfo = skb_shinfo(skb);
258 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
259 atomic_set(&shinfo->dataref, 1);
260 kmemcheck_annotate_variable(shinfo->destructor_arg);
262 if (flags & SKB_ALLOC_FCLONE) {
263 struct sk_buff *child = skb + 1;
264 atomic_t *fclone_ref = (atomic_t *) (child + 1);
266 kmemcheck_annotate_bitfield(child, flags1);
267 kmemcheck_annotate_bitfield(child, flags2);
268 skb->fclone = SKB_FCLONE_ORIG;
269 atomic_set(fclone_ref, 1);
271 child->fclone = SKB_FCLONE_UNAVAILABLE;
272 child->pfmemalloc = pfmemalloc;
277 kmem_cache_free(cache, skb);
281 EXPORT_SYMBOL(__alloc_skb);
284 * build_skb - build a network buffer
285 * @data: data buffer provided by caller
286 * @frag_size: size of fragment, or 0 if head was kmalloced
288 * Allocate a new &sk_buff. Caller provides space holding head and
289 * skb_shared_info. @data must have been allocated by kmalloc() only if
290 * @frag_size is 0, otherwise it is treated as a page fragment. The return is the new skb buffer.
291 * On a failure the return is %NULL, and @data is not freed.
293 * Before IO, the driver allocates only the data buffer where the NIC puts the incoming frame.
294 * The driver should add room at head (NET_SKB_PAD) and
295 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)).
296 * After IO, the driver calls build_skb() to allocate the sk_buff and populate it
297 * before giving the packet to the stack.
298 * RX rings only contain data buffers, not full skbs.
300 struct sk_buff *build_skb(void *data, unsigned int frag_size)
302 struct skb_shared_info *shinfo;
304 unsigned int size = frag_size ? : ksize(data);
306 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
310 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
312 memset(skb, 0, offsetof(struct sk_buff, tail));
313 skb->truesize = SKB_TRUESIZE(size);
314 skb->head_frag = frag_size != 0;
315 atomic_set(&skb->users, 1);
318 skb_reset_tail_pointer(skb);
319 skb->end = skb->tail + size;
320 #ifdef NET_SKBUFF_DATA_USES_OFFSET
321 skb->mac_header = ~0U;
322 skb->transport_header = ~0U;
325 /* make sure we initialize shinfo sequentially */
326 shinfo = skb_shinfo(skb);
327 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
328 atomic_set(&shinfo->dataref, 1);
329 kmemcheck_annotate_variable(shinfo->destructor_arg);
333 EXPORT_SYMBOL(build_skb);
335 struct netdev_alloc_cache {
336 struct page_frag frag;
337 /* we maintain a pagecount bias, so that we don't dirty the cache line
338 * containing page->_count every time we allocate a fragment.
340 unsigned int pagecnt_bias;
342 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
344 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
346 struct netdev_alloc_cache *nc;
351 local_irq_save(flags);
352 nc = &__get_cpu_var(netdev_alloc_cache);
353 if (unlikely(!nc->frag.page)) {
355 for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
356 gfp_t gfp = gfp_mask;
359 gfp |= __GFP_COMP | __GFP_NOWARN;
360 nc->frag.page = alloc_pages(gfp, order);
361 if (likely(nc->frag.page))
366 nc->frag.size = PAGE_SIZE << order;
368 atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
369 nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
373 if (nc->frag.offset + fragsz > nc->frag.size) {
374 /* avoid unnecessary locked operations if possible */
375 if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
376 atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
381 data = page_address(nc->frag.page) + nc->frag.offset;
382 nc->frag.offset += fragsz;
385 local_irq_restore(flags);
390 * netdev_alloc_frag - allocate a page fragment
391 * @fragsz: fragment size
393 * Allocates a frag from a page for receive buffer.
394 * Uses GFP_ATOMIC allocations.
396 void *netdev_alloc_frag(unsigned int fragsz)
398 return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
400 EXPORT_SYMBOL(netdev_alloc_frag);
403 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
404 * @dev: network device to receive on
405 * @length: length to allocate
406 * @gfp_mask: get_free_pages mask, passed to alloc_skb
408 * Allocate a new &sk_buff and assign it a usage count of one. The
409 * buffer has unspecified headroom built in. Users should allocate
410 * the headroom they think they need without accounting for the
411 * built in space. The built in space is used for optimisations.
413 * %NULL is returned if there is no free memory.
415 struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
416 unsigned int length, gfp_t gfp_mask)
418 struct sk_buff *skb = NULL;
419 unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
420 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
422 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
425 if (sk_memalloc_socks())
426 gfp_mask |= __GFP_MEMALLOC;
428 data = __netdev_alloc_frag(fragsz, gfp_mask);
431 skb = build_skb(data, fragsz);
433 put_page(virt_to_head_page(data));
436 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
437 SKB_ALLOC_RX, NUMA_NO_NODE);
440 skb_reserve(skb, NET_SKB_PAD);
445 EXPORT_SYMBOL(__netdev_alloc_skb);
447 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
448 int size, unsigned int truesize)
450 skb_fill_page_desc(skb, i, page, off, size);
452 skb->data_len += size;
453 skb->truesize += truesize;
455 EXPORT_SYMBOL(skb_add_rx_frag);
457 static void skb_drop_list(struct sk_buff **listp)
459 struct sk_buff *list = *listp;
464 struct sk_buff *this = list;
470 static inline void skb_drop_fraglist(struct sk_buff *skb)
472 skb_drop_list(&skb_shinfo(skb)->frag_list);
475 static void skb_clone_fraglist(struct sk_buff *skb)
477 struct sk_buff *list;
479 skb_walk_frags(skb, list)
483 static void skb_free_head(struct sk_buff *skb)
486 put_page(virt_to_head_page(skb->head));
491 static void skb_release_data(struct sk_buff *skb)
494 !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
495 &skb_shinfo(skb)->dataref)) {
496 if (skb_shinfo(skb)->nr_frags) {
498 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
499 skb_frag_unref(skb, i);
503 * If the skb buffer is from userspace, we need to notify the caller
504 * that the lower device's DMA is done.
506 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
507 struct ubuf_info *uarg;
509 uarg = skb_shinfo(skb)->destructor_arg;
511 uarg->callback(uarg, true);
514 if (skb_has_frag_list(skb))
515 skb_drop_fraglist(skb);
522 * Free an skbuff by memory without cleaning the state.
524 static void kfree_skbmem(struct sk_buff *skb)
526 struct sk_buff *other;
527 atomic_t *fclone_ref;
529 switch (skb->fclone) {
530 case SKB_FCLONE_UNAVAILABLE:
531 kmem_cache_free(skbuff_head_cache, skb);
534 case SKB_FCLONE_ORIG:
535 fclone_ref = (atomic_t *) (skb + 2);
536 if (atomic_dec_and_test(fclone_ref))
537 kmem_cache_free(skbuff_fclone_cache, skb);
540 case SKB_FCLONE_CLONE:
541 fclone_ref = (atomic_t *) (skb + 1);
544 /* The clone portion is available for
545 * fast-cloning again.
547 skb->fclone = SKB_FCLONE_UNAVAILABLE;
549 if (atomic_dec_and_test(fclone_ref))
550 kmem_cache_free(skbuff_fclone_cache, other);
555 static void skb_release_head_state(struct sk_buff *skb)
559 secpath_put(skb->sp);
561 if (skb->destructor) {
563 skb->destructor(skb);
565 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
566 nf_conntrack_put(skb->nfct);
568 #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
569 nf_conntrack_put_reasm(skb->nfct_reasm);
571 #ifdef CONFIG_BRIDGE_NETFILTER
572 nf_bridge_put(skb->nf_bridge);
574 /* XXX: Is this still necessary? - JHS */
575 #ifdef CONFIG_NET_SCHED
577 #ifdef CONFIG_NET_CLS_ACT
583 /* Free everything but the sk_buff shell. */
584 static void skb_release_all(struct sk_buff *skb)
586 skb_release_head_state(skb);
587 skb_release_data(skb);
591 * __kfree_skb - private function
594 * Free an sk_buff. Release anything attached to the buffer.
595 * Clean the state. This is an internal helper function. Users should
596 * always call kfree_skb
599 void __kfree_skb(struct sk_buff *skb)
601 skb_release_all(skb);
604 EXPORT_SYMBOL(__kfree_skb);
607 * kfree_skb - free an sk_buff
608 * @skb: buffer to free
610 * Drop a reference to the buffer and free it if the usage count has
613 void kfree_skb(struct sk_buff *skb)
617 if (likely(atomic_read(&skb->users) == 1))
619 else if (likely(!atomic_dec_and_test(&skb->users)))
621 trace_kfree_skb(skb, __builtin_return_address(0));
624 EXPORT_SYMBOL(kfree_skb);
627 * skb_tx_error - report an sk_buff xmit error
628 * @skb: buffer that triggered an error
630 * Report xmit error if a device callback is tracking this skb.
631 * skb must be freed afterwards.
633 void skb_tx_error(struct sk_buff *skb)
635 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
636 struct ubuf_info *uarg;
638 uarg = skb_shinfo(skb)->destructor_arg;
640 uarg->callback(uarg, false);
641 skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
644 EXPORT_SYMBOL(skb_tx_error);
647 * consume_skb - free an skbuff
648 * @skb: buffer to free
650 * Drop a ref to the buffer and free it if the usage count has hit zero
651 * Functions identically to kfree_skb, but kfree_skb assumes that the frame
652 * is being dropped after a failure and notes that
654 void consume_skb(struct sk_buff *skb)
658 if (likely(atomic_read(&skb->users) == 1))
660 else if (likely(!atomic_dec_and_test(&skb->users)))
662 trace_consume_skb(skb);
665 EXPORT_SYMBOL(consume_skb);
667 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
669 new->tstamp = old->tstamp;
671 new->transport_header = old->transport_header;
672 new->network_header = old->network_header;
673 new->mac_header = old->mac_header;
674 new->inner_transport_header = old->inner_transport_header;
675 new->inner_network_header = old->inner_network_header;
676 skb_dst_copy(new, old);
677 new->rxhash = old->rxhash;
678 new->ooo_okay = old->ooo_okay;
679 new->l4_rxhash = old->l4_rxhash;
680 new->no_fcs = old->no_fcs;
681 new->encapsulation = old->encapsulation;
683 new->sp = secpath_get(old->sp);
685 memcpy(new->cb, old->cb, sizeof(old->cb));
686 new->csum = old->csum;
687 new->local_df = old->local_df;
688 new->pkt_type = old->pkt_type;
689 new->ip_summed = old->ip_summed;
690 skb_copy_queue_mapping(new, old);
691 new->priority = old->priority;
692 #if IS_ENABLED(CONFIG_IP_VS)
693 new->ipvs_property = old->ipvs_property;
695 new->pfmemalloc = old->pfmemalloc;
696 new->protocol = old->protocol;
697 new->mark = old->mark;
698 new->skb_iif = old->skb_iif;
700 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
701 new->nf_trace = old->nf_trace;
703 #ifdef CONFIG_NET_SCHED
704 new->tc_index = old->tc_index;
705 #ifdef CONFIG_NET_CLS_ACT
706 new->tc_verd = old->tc_verd;
709 new->vlan_tci = old->vlan_tci;
711 skb_copy_secmark(new, old);
715 * You should not add any new code to this function. Add it to
716 * __copy_skb_header above instead.
718 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
720 #define C(x) n->x = skb->x