Merge branch 'akpm' (Andrew's patch-bomb)
author	Linus Torvalds <torvalds@linux-foundation.org>
Thu, 29 Mar 2012 00:19:27 +0000 (17:19 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Thu, 29 Mar 2012 00:19:28 +0000 (17:19 -0700)
Merge third batch of patches from Andrew Morton:
 - Some MM stragglers
 - core SMP library cleanups (on_each_cpu_mask)
 - Some IPI optimisations
 - kexec
 - kdump
 - IPMI
 - the radix-tree iterator work
 - various other misc bits.

 "That'll do for -rc1.  I still have ~10 patches for 3.4, will send
  those along when they've baked a little more."

* emailed from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  backlight: fix typo in tosa_lcd.c
  crc32: add help text for the algorithm select option
  mm: move hugepage test examples to tools/testing/selftests/vm
  mm: move slabinfo.c to tools/vm
  mm: move page-types.c from Documentation to tools/vm
  selftests/Makefile: make `run_tests' depend on `all'
  selftests: launch individual selftests from the main Makefile
  radix-tree: use iterators in find_get_pages* functions
  radix-tree: rewrite gang lookup using iterator
  radix-tree: introduce bit-optimized iterator
  fs/proc/namespaces.c: prevent crash when ns_entries[] is empty
  nbd: rename the nbd_device variable from lo to nbd
  pidns: add reboot_pid_ns() to handle the reboot syscall
  sysctl: use bitmap library functions
  ipmi: use locks on watchdog timeout set on reboot
  ipmi: simplify locking
  ipmi: fix message handling during panics
  ipmi: use a tasklet for handling received messages
  ipmi: increase KCS timeouts
  ipmi: decrease the IPMI message transaction time in interrupt mode
  ...

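The radix-tree iterator work rewrites the gang-lookup helpers (find_get_pages() and
friends) on top of a new chunked iterator. A minimal sketch of the new lookup idiom,
assuming a struct address_space *mapping and a pgoff_t start; the in-tree conversions
additionally handle deref retries and shmem's exceptional entries:

    struct radix_tree_iter iter;
    void **slot;

    rcu_read_lock();
    radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
            struct page *page = radix_tree_deref_slot(slot);

            if (unlikely(!page))
                    continue;
            /* take a reference, re-check the slot, then use the page */
    }
    rcu_read_unlock();
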
60 files changed:
Documentation/Makefile
Documentation/vm/Makefile [deleted file]
Documentation/vm/hugepage-mmap.c [deleted file]
Documentation/vm/hugepage-shm.c [deleted file]
Documentation/vm/map_hugetlb.c [deleted file]
Documentation/vm/page-types.c [deleted file]
arch/arm/kernel/smp_tlb.c
arch/ia64/kernel/acpi.c
arch/ia64/kernel/irq_ia64.c
arch/ia64/kernel/mca.c
arch/ia64/kernel/msi_ia64.c
arch/ia64/kernel/setup.c
arch/ia64/kernel/smp.c
arch/ia64/kernel/smpboot.c
arch/ia64/kernel/topology.c
arch/tile/include/asm/smp.h
arch/tile/kernel/smp.c
arch/x86/kernel/setup.c
drivers/block/nbd.c
drivers/char/ipmi/ipmi_kcs_sm.c
drivers/char/ipmi/ipmi_msghandler.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/ipmi/ipmi_watchdog.c
drivers/video/backlight/tosa_lcd.c
fs/buffer.c
fs/proc/array.c
fs/proc/namespaces.c
fs/proc/task_mmu.c
include/linux/cpumask.h
include/linux/mm.h
include/linux/pid_namespace.h
include/linux/radix-tree.h
include/linux/smp.h
include/linux/swap.h
kernel/kexec.c
kernel/pid_namespace.c
kernel/smp.c
kernel/sys.c
kernel/sysctl.c
lib/Kconfig
lib/cpumask.c
lib/radix-tree.c
mm/filemap.c
mm/memcontrol.c
mm/page_alloc.c
mm/slub.c
mm/swapfile.c
mm/truncate.c
tools/slub/slabinfo.c [deleted file]
tools/testing/selftests/Makefile
tools/testing/selftests/breakpoints/Makefile
tools/testing/selftests/run_tests [deleted file]
tools/testing/selftests/vm/Makefile [new file with mode: 0644]
tools/testing/selftests/vm/hugepage-mmap.c [new file with mode: 0644]
tools/testing/selftests/vm/hugepage-shm.c [new file with mode: 0644]
tools/testing/selftests/vm/map_hugetlb.c [new file with mode: 0644]
tools/testing/selftests/vm/run_vmtests [new file with mode: 0644]
tools/vm/Makefile [new file with mode: 0644]
tools/vm/page-types.c [new file with mode: 0644]
tools/vm/slabinfo.c [new file with mode: 0644]

diff --git a/Documentation/Makefile b/Documentation/Makefile
index 9b4bc5c76f335341d568062d1f4034ec530458a7..30b656ece7aaed21076daa92d865ad472f607e22 100644
@@ -1,3 +1,3 @@
 obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
        filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
-       pcmcia/ spi/ timers/ vm/ watchdog/src/
+       pcmcia/ spi/ timers/ watchdog/src/
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
deleted file mode 100644
index 3fa4d06..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
-# List of programs to build
-hostprogs-y := page-types hugepage-mmap hugepage-shm map_hugetlb
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
diff --git a/Documentation/vm/hugepage-mmap.c b/Documentation/vm/hugepage-mmap.c
deleted file mode 100644
index db0dd9a..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * hugepage-mmap:
- *
- * Example of using huge page memory in a user application using the mmap
- * system call.  Before running this application, make sure that the
- * administrator has mounted the hugetlbfs filesystem (on some directory
- * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
- * example, the app is requesting memory of size 256MB that is backed by
- * huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define FILE_NAME "/mnt/hugepagefile"
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_SHARED | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_SHARED)
-#endif
-
-static void check_bytes(char *addr)
-{
-       printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr)
-{
-       unsigned long i;
-
-       for (i = 0; i < LENGTH; i++)
-               *(addr + i) = (char)i;
-}
-
-static void read_bytes(char *addr)
-{
-       unsigned long i;
-
-       check_bytes(addr);
-       for (i = 0; i < LENGTH; i++)
-               if (*(addr + i) != (char)i) {
-                       printf("Mismatch at %lu\n", i);
-                       break;
-               }
-}
-
-int main(void)
-{
-       void *addr;
-       int fd;
-
-       fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
-       if (fd < 0) {
-               perror("Open failed");
-               exit(1);
-       }
-
-       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               unlink(FILE_NAME);
-               exit(1);
-       }
-
-       printf("Returned address is %p\n", addr);
-       check_bytes(addr);
-       write_bytes(addr);
-       read_bytes(addr);
-
-       munmap(addr, LENGTH);
-       close(fd);
-       unlink(FILE_NAME);
-
-       return 0;
-}
diff --git a/Documentation/vm/hugepage-shm.c b/Documentation/vm/hugepage-shm.c
deleted file mode 100644
index 07956d8..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * hugepage-shm:
- *
- * Example of using huge page memory in a user application using Sys V shared
- * memory system calls.  In this example the app is requesting 256MB of
- * memory that is backed by huge pages.  The application uses the flag
- * SHM_HUGETLB in the shmget system call to inform the kernel that it is
- * requesting huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- *
- * Note: The default shared memory limit is quite low on many kernels,
- * you may need to increase it via:
- *
- * echo 268435456 > /proc/sys/kernel/shmmax
- *
- * This will increase the maximum size per shared memory segment to 256MB.
- * The other limit that you will hit eventually is shmall which is the
- * total amount of shared memory in pages. To set it to 16GB on a system
- * with a 4kB pagesize do:
- *
- * echo 4194304 > /proc/sys/kernel/shmall
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-#define LENGTH (256UL*1024*1024)
-
-#define dprintf(x)  printf(x)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define SHMAT_FLAGS (SHM_RND)
-#else
-#define ADDR (void *)(0x0UL)
-#define SHMAT_FLAGS (0)
-#endif
-
-int main(void)
-{
-       int shmid;
-       unsigned long i;
-       char *shmaddr;
-
-       if ((shmid = shmget(2, LENGTH,
-                           SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
-               perror("shmget");
-               exit(1);
-       }
-       printf("shmid: 0x%x\n", shmid);
-
-       shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
-       if (shmaddr == (char *)-1) {
-               perror("Shared memory attach failure");
-               shmctl(shmid, IPC_RMID, NULL);
-               exit(2);
-       }
-       printf("shmaddr: %p\n", shmaddr);
-
-       dprintf("Starting the writes:\n");
-       for (i = 0; i < LENGTH; i++) {
-               shmaddr[i] = (char)(i);
-               if (!(i % (1024 * 1024)))
-                       dprintf(".");
-       }
-       dprintf("\n");
-
-       dprintf("Starting the Check...");
-       for (i = 0; i < LENGTH; i++)
-               if (shmaddr[i] != (char)i)
-                       printf("\nIndex %lu mismatched\n", i);
-       dprintf("Done.\n");
-
-       if (shmdt((const void *)shmaddr) != 0) {
-               perror("Detach failure");
-               shmctl(shmid, IPC_RMID, NULL);
-               exit(3);
-       }
-
-       shmctl(shmid, IPC_RMID, NULL);
-
-       return 0;
-}
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
deleted file mode 100644
index eda1a6d..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Example of using hugepage memory in a user application using the mmap
- * system call with MAP_HUGETLB flag.  Before running this program make
- * sure the administrator has allocated enough default sized huge pages
- * to cover the 256 MB allocation.
- *
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
- * That means the addresses starting with 0x800000... will need to be
- * specified.  Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x40000 /* arch specific */
-#endif
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
-#endif
-
-static void check_bytes(char *addr)
-{
-       printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr)
-{
-       unsigned long i;
-
-       for (i = 0; i < LENGTH; i++)
-               *(addr + i) = (char)i;
-}
-
-static void read_bytes(char *addr)
-{
-       unsigned long i;
-
-       check_bytes(addr);
-       for (i = 0; i < LENGTH; i++)
-               if (*(addr + i) != (char)i) {
-                       printf("Mismatch at %lu\n", i);
-                       break;
-               }
-}
-
-int main(void)
-{
-       void *addr;
-
-       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       printf("Returned address is %p\n", addr);
-       check_bytes(addr);
-       write_bytes(addr);
-       read_bytes(addr);
-
-       munmap(addr, LENGTH);
-
-       return 0;
-}
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
deleted file mode 100644
index 0b13f02..0000000
+++ /dev/null
@@ -1,1102 +0,0 @@
-/*
- * page-types: Tool for querying page flags
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; version 2.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should find a copy of v2 of the GNU General Public License somewhere on
- * your Linux system; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2009 Intel corporation
- *
- * Authors: Wu Fengguang <fengguang.wu@intel.com>
- */
-
-#define _LARGEFILE64_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdarg.h>
-#include <string.h>
-#include <getopt.h>
-#include <limits.h>
-#include <assert.h>
-#include <sys/types.h>
-#include <sys/errno.h>
-#include <sys/fcntl.h>
-#include <sys/mount.h>
-#include <sys/statfs.h>
-#include "../../include/linux/magic.h"
-
-
-#ifndef MAX_PATH
-# define MAX_PATH 256
-#endif
-
-#ifndef STR
-# define _STR(x) #x
-# define STR(x) _STR(x)
-#endif
-
-/*
- * pagemap kernel ABI bits
- */
-
-#define PM_ENTRY_BYTES      sizeof(uint64_t)
-#define PM_STATUS_BITS      3
-#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS      6
-#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
-
-#define PM_PRESENT          PM_STATUS(4LL)
-#define PM_SWAP             PM_STATUS(2LL)
-
-
-/*
- * kernel page flags
- */
-
-#define KPF_BYTES              8
-#define PROC_KPAGEFLAGS                "/proc/kpageflags"
-
-/* copied from kpageflags_read() */
-#define KPF_LOCKED             0
-#define KPF_ERROR              1
-#define KPF_REFERENCED         2
-#define KPF_UPTODATE           3
-#define KPF_DIRTY              4
-#define KPF_LRU                        5
-#define KPF_ACTIVE             6
-#define KPF_SLAB               7
-#define KPF_WRITEBACK          8
-#define KPF_RECLAIM            9
-#define KPF_BUDDY              10
-
-/* [11-20] new additions in 2.6.31 */
-#define KPF_MMAP               11
-#define KPF_ANON               12
-#define KPF_SWAPCACHE          13
-#define KPF_SWAPBACKED         14
-#define KPF_COMPOUND_HEAD      15
-#define KPF_COMPOUND_TAIL      16
-#define KPF_HUGE               17
-#define KPF_UNEVICTABLE                18
-#define KPF_HWPOISON           19
-#define KPF_NOPAGE             20
-#define KPF_KSM                        21
-#define KPF_THP                        22
-
-/* [32-] kernel hacking assistances */
-#define KPF_RESERVED           32
-#define KPF_MLOCKED            33
-#define KPF_MAPPEDTODISK       34
-#define KPF_PRIVATE            35
-#define KPF_PRIVATE_2          36
-#define KPF_OWNER_PRIVATE      37
-#define KPF_ARCH               38
-#define KPF_UNCACHED           39
-
-/* [48-] take some arbitrary free slots for expanding overloaded flags
- * not part of kernel API
- */
-#define KPF_READAHEAD          48
-#define KPF_SLOB_FREE          49
-#define KPF_SLUB_FROZEN                50
-#define KPF_SLUB_DEBUG         51
-
-#define KPF_ALL_BITS           ((uint64_t)~0ULL)
-#define KPF_HACKERS_BITS       (0xffffULL << 32)
-#define KPF_OVERLOADED_BITS    (0xffffULL << 48)
-#define BIT(name)              (1ULL << KPF_##name)
-#define BITS_COMPOUND          (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
-
-static const char *page_flag_names[] = {
-       [KPF_LOCKED]            = "L:locked",
-       [KPF_ERROR]             = "E:error",
-       [KPF_REFERENCED]        = "R:referenced",
-       [KPF_UPTODATE]          = "U:uptodate",
-       [KPF_DIRTY]             = "D:dirty",
-       [KPF_LRU]               = "l:lru",
-       [KPF_ACTIVE]            = "A:active",
-       [KPF_SLAB]              = "S:slab",
-       [KPF_WRITEBACK]         = "W:writeback",
-       [KPF_RECLAIM]           = "I:reclaim",
-       [KPF_BUDDY]             = "B:buddy",
-
-       [KPF_MMAP]              = "M:mmap",
-       [KPF_ANON]              = "a:anonymous",
-       [KPF_SWAPCACHE]         = "s:swapcache",
-       [KPF_SWAPBACKED]        = "b:swapbacked",
-       [KPF_COMPOUND_HEAD]     = "H:compound_head",
-       [KPF_COMPOUND_TAIL]     = "T:compound_tail",
-       [KPF_HUGE]              = "G:huge",
-       [KPF_UNEVICTABLE]       = "u:unevictable",
-       [KPF_HWPOISON]          = "X:hwpoison",
-       [KPF_NOPAGE]            = "n:nopage",
-       [KPF_KSM]               = "x:ksm",
-       [KPF_THP]               = "t:thp",
-
-       [KPF_RESERVED]          = "r:reserved",
-       [KPF_MLOCKED]           = "m:mlocked",
-       [KPF_MAPPEDTODISK]      = "d:mappedtodisk",
-       [KPF_PRIVATE]           = "P:private",
-       [KPF_PRIVATE_2]         = "p:private_2",
-       [KPF_OWNER_PRIVATE]     = "O:owner_private",
-       [KPF_ARCH]              = "h:arch",
-       [KPF_UNCACHED]          = "c:uncached",
-
-       [KPF_READAHEAD]         = "I:readahead",
-       [KPF_SLOB_FREE]         = "P:slob_free",
-       [KPF_SLUB_FROZEN]       = "A:slub_frozen",
-       [KPF_SLUB_DEBUG]        = "E:slub_debug",
-};
-
-
-static const char *debugfs_known_mountpoints[] = {
-       "/sys/kernel/debug",
-       "/debug",
-       0,
-};
-
-/*
- * data structures
- */
-
-static int             opt_raw;        /* for kernel developers */
-static int             opt_list;       /* list pages (in ranges) */
-static int             opt_no_summary; /* don't show summary */
-static pid_t           opt_pid;        /* process to walk */
-
-#define MAX_ADDR_RANGES        1024
-static int             nr_addr_ranges;
-static unsigned long   opt_offset[MAX_ADDR_RANGES];
-static unsigned long   opt_size[MAX_ADDR_RANGES];
-
-#define MAX_VMAS       10240
-static int             nr_vmas;
-static unsigned long   pg_start[MAX_VMAS];
-static unsigned long   pg_end[MAX_VMAS];
-
-#define MAX_BIT_FILTERS        64
-static int             nr_bit_filters;
-static uint64_t                opt_mask[MAX_BIT_FILTERS];
-static uint64_t                opt_bits[MAX_BIT_FILTERS];
-
-static int             page_size;
-
-static int             pagemap_fd;
-static int             kpageflags_fd;
-
-static int             opt_hwpoison;
-static int             opt_unpoison;
-
-static char            hwpoison_debug_fs[MAX_PATH+1];
-static int             hwpoison_inject_fd;
-static int             hwpoison_forget_fd;
-
-#define HASH_SHIFT     13
-#define HASH_SIZE      (1 << HASH_SHIFT)
-#define HASH_MASK      (HASH_SIZE - 1)
-#define HASH_KEY(flags)        (flags & HASH_MASK)
-
-static unsigned long   total_pages;
-static unsigned long   nr_pages[HASH_SIZE];
-static uint64_t        page_flags[HASH_SIZE];
-
-
-/*
- * helper functions
- */
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define min_t(type, x, y) ({                   \
-       type __min1 = (x);                      \
-       type __min2 = (y);                      \
-       __min1 < __min2 ? __min1 : __min2; })
-
-#define max_t(type, x, y) ({                   \
-       type __max1 = (x);                      \
-       type __max2 = (y);                      \
-       __max1 > __max2 ? __max1 : __max2; })
-
-static unsigned long pages2mb(unsigned long pages)
-{
-       return (pages * page_size) >> 20;
-}
-
-static void fatal(const char *x, ...)
-{
-       va_list ap;
-
-       va_start(ap, x);
-       vfprintf(stderr, x, ap);
-       va_end(ap);
-       exit(EXIT_FAILURE);
-}
-
-static int checked_open(const char *pathname, int flags)
-{
-       int fd = open(pathname, flags);
-
-       if (fd < 0) {
-               perror(pathname);
-               exit(EXIT_FAILURE);
-       }
-
-       return fd;
-}
-
-/*
- * pagemap/kpageflags routines
- */
-
-static unsigned long do_u64_read(int fd, char *name,
-                                uint64_t *buf,
-                                unsigned long index,
-                                unsigned long count)
-{
-       long bytes;
-
-       if (index > ULONG_MAX / 8)
-               fatal("index overflow: %lu\n", index);
-
-       if (lseek(fd, index * 8, SEEK_SET) < 0) {
-               perror(name);
-               exit(EXIT_FAILURE);
-       }
-
-       bytes = read(fd, buf, count * 8);
-       if (bytes < 0) {
-               perror(name);
-               exit(EXIT_FAILURE);
-       }
-       if (bytes % 8)
-               fatal("partial read: %lu bytes\n", bytes);
-
-       return bytes / 8;
-}
-
-static unsigned long kpageflags_read(uint64_t *buf,
-                                    unsigned long index,
-                                    unsigned long pages)
-{
-       return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
-}
-
-static unsigned long pagemap_read(uint64_t *buf,
-                                 unsigned long index,
-                                 unsigned long pages)
-{
-       return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
-}
-
-static unsigned long pagemap_pfn(uint64_t val)
-{
-       unsigned long pfn;
-
-       if (val & PM_PRESENT)
-               pfn = PM_PFRAME(val);
-       else
-               pfn = 0;
-
-       return pfn;
-}
-
-
-/*
- * page flag names
- */
-
-static char *page_flag_name(uint64_t flags)
-{
-       static char buf[65];
-       int present;
-       int i, j;
-
-       for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-               present = (flags >> i) & 1;
-               if (!page_flag_names[i]) {
-                       if (present)
-                               fatal("unknown flag bit %d\n", i);
-                       continue;
-               }
-               buf[j++] = present ? page_flag_names[i][0] : '_';
-       }
-
-       return buf;
-}
-
-static char *page_flag_longname(uint64_t flags)
-{
-       static char buf[1024];
-       int i, n;
-
-       for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-               if (!page_flag_names[i])
-                       continue;
-               if ((flags >> i) & 1)
-                       n += snprintf(buf + n, sizeof(buf) - n, "%s,",
-                                       page_flag_names[i] + 2);
-       }
-       if (n)
-               n--;
-       buf[n] = '\0';
-
-       return buf;
-}
-
-
-/*
- * page list and summary
- */
-
-static void show_page_range(unsigned long voffset,
-                           unsigned long offset, uint64_t flags)
-{
-       static uint64_t      flags0;
-       static unsigned long voff;
-       static unsigned long index;
-       static unsigned long count;
-
-       if (flags == flags0 && offset == index + count &&
-           (!opt_pid || voffset == voff + count)) {
-               count++;
-               return;
-       }
-
-       if (count) {
-               if (opt_pid)
-                       printf("%lx\t", voff);
-               printf("%lx\t%lx\t%s\n",
-                               index, count, page_flag_name(flags0));
-       }
-
-       flags0 = flags;
-       index  = offset;
-       voff   = voffset;
-       count  = 1;
-}
-
-static void show_page(unsigned long voffset,
-                     unsigned long offset, uint64_t flags)
-{
-       if (opt_pid)
-               printf("%lx\t", voffset);
-       printf("%lx\t%s\n", offset, page_flag_name(flags));
-}
-
-static void show_summary(void)
-{
-       int i;
-
-       printf("             flags\tpage-count       MB"
-               "  symbolic-flags\t\t\tlong-symbolic-flags\n");
-
-       for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
-               if (nr_pages[i])
-                       printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
-                               (unsigned long long)page_flags[i],
-                               nr_pages[i],
-                               pages2mb(nr_pages[i]),
-                               page_flag_name(page_flags[i]),
-                               page_flag_longname(page_flags[i]));
-       }
-
-       printf("             total\t%10lu %8lu\n",
-                       total_pages, pages2mb(total_pages));
-}
-
-
-/*
- * page flag filters
- */
-
-static int bit_mask_ok(uint64_t flags)
-{
-       int i;
-
-       for (i = 0; i < nr_bit_filters; i++) {
-               if (opt_bits[i] == KPF_ALL_BITS) {
-                       if ((flags & opt_mask[i]) == 0)
-                               return 0;
-               } else {
-                       if ((flags & opt_mask[i]) != opt_bits[i])
-                               return 0;
-               }
-       }
-
-       return 1;
-}
-
-static uint64_t expand_overloaded_flags(uint64_t flags)
-{
-       /* SLOB/SLUB overload several page flags */
-       if (flags & BIT(SLAB)) {
-               if (flags & BIT(PRIVATE))
-                       flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
-               if (flags & BIT(ACTIVE))
-                       flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
-               if (flags & BIT(ERROR))
-                       flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
-       }
-
-       /* PG_reclaim is overloaded as PG_readahead in the read path */
-       if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
-               flags ^= BIT(RECLAIM) | BIT(READAHEAD);
-
-       return flags;
-}
-
-static uint64_t well_known_flags(uint64_t flags)
-{
-       /* hide flags intended only for kernel hacker */
-       flags &= ~KPF_HACKERS_BITS;
-
-       /* hide non-hugeTLB compound pages */
-       if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
-               flags &= ~BITS_COMPOUND;
-
-       return flags;
-}
-
-static uint64_t kpageflags_flags(uint64_t flags)
-{
-       flags = expand_overloaded_flags(flags);
-
-       if (!opt_raw)
-               flags = well_known_flags(flags);
-
-       return flags;
-}
-
-/* verify that a mountpoint is actually a debugfs instance */
-static int debugfs_valid_mountpoint(const char *debugfs)
-{
-       struct statfs st_fs;
-
-       if (statfs(debugfs, &st_fs) < 0)
-               return -ENOENT;
-       else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
-               return -ENOENT;
-
-       return 0;
-}
-
-/* find the path to the mounted debugfs */
-static const char *debugfs_find_mountpoint(void)
-{
-       const char **ptr;
-       char type[100];
-       FILE *fp;
-
-       ptr = debugfs_known_mountpoints;
-       while (*ptr) {
-               if (debugfs_valid_mountpoint(*ptr) == 0) {
-                       strcpy(hwpoison_debug_fs, *ptr);
-                       return hwpoison_debug_fs;
-               }
-               ptr++;
-       }
-
-       /* give up and parse /proc/mounts */
-       fp = fopen("/proc/mounts", "r");
-       if (fp == NULL)
-               perror("Can't open /proc/mounts for read");
-
-       while (fscanf(fp, "%*s %"
-                     STR(MAX_PATH)
-                     "s %99s %*s %*d %*d\n",
-                     hwpoison_debug_fs, type) == 2) {
-               if (strcmp(type, "debugfs") == 0)
-                       break;
-       }
-       fclose(fp);
-
-       if (strcmp(type, "debugfs") != 0)
-               return NULL;
-
-       return hwpoison_debug_fs;
-}
-
-/* mount the debugfs somewhere if it's not mounted */
-
-static void debugfs_mount(void)
-{
-       const char **ptr;
-
-       /* see if it's already mounted */
-       if (debugfs_find_mountpoint())
-               return;
-
-       ptr = debugfs_known_mountpoints;
-       while (*ptr) {
-               if (mount(NULL, *ptr, "debugfs", 0, NULL) == 0) {
-                       /* save the mountpoint */
-                       strcpy(hwpoison_debug_fs, *ptr);
-                       break;
-               }
-               ptr++;
-       }
-
-       if (*ptr == NULL) {
-               perror("mount debugfs");
-               exit(EXIT_FAILURE);
-       }
-}
-
-/*
- * page actions
- */
-
-static void prepare_hwpoison_fd(void)
-{
-       char buf[MAX_PATH + 1];
-
-       debugfs_mount();
-
-       if (opt_hwpoison && !hwpoison_inject_fd) {
-               snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
-                       hwpoison_debug_fs);
-               hwpoison_inject_fd = checked_open(buf, O_WRONLY);
-       }
-
-       if (opt_unpoison && !hwpoison_forget_fd) {
-               snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
-                       hwpoison_debug_fs);
-               hwpoison_forget_fd = checked_open(buf, O_WRONLY);
-       }
-}
-
-static int hwpoison_page(unsigned long offset)
-{
-       char buf[100];
-       int len;
-
-       len = sprintf(buf, "0x%lx\n", offset);
-       len = write(hwpoison_inject_fd, buf, len);
-       if (len < 0) {
-               perror("hwpoison inject");
-               return len;
-       }
-       return 0;
-}
-
-static int unpoison_page(unsigned long offset)
-{
-       char buf[100];
-       int len;
-
-       len = sprintf(buf, "0x%lx\n", offset);
-       len = write(hwpoison_forget_fd, buf, len);
-       if (len < 0) {
-               perror("hwpoison forget");
-               return len;
-       }
-       return 0;
-}
-
-/*
- * page frame walker
- */
-
-static int hash_slot(uint64_t flags)
-{
-       int k = HASH_KEY(flags);
-       int i;
-
-       /* Explicitly reserve slot 0 for flags 0: the following logic
-        * cannot distinguish an unoccupied slot from slot (flags==0).
-        */
-       if (flags == 0)
-               return 0;
-
-       /* search through the remaining (HASH_SIZE-1) slots */
-       for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
-               if (!k || k >= ARRAY_SIZE(page_flags))
-                       k = 1;
-               if (page_flags[k] == 0) {
-                       page_flags[k] = flags;
-                       return k;
-               }
-               if (page_flags[k] == flags)
-                       return k;
-       }
-
-       fatal("hash table full: bump up HASH_SHIFT?\n");
-       exit(EXIT_FAILURE);
-}
-
-static void add_page(unsigned long voffset,
-                    unsigned long offset, uint64_t flags)
-{
-       flags = kpageflags_flags(flags);
-
-       if (!bit_mask_ok(flags))
-               return;
-
-       if (opt_hwpoison)
-               hwpoison_page(offset);
-       if (opt_unpoison)
-               unpoison_page(offset);
-
-       if (opt_list == 1)
-               show_page_range(voffset, offset, flags);
-       else if (opt_list == 2)
-               show_page(voffset, offset, flags);
-
-       nr_pages[hash_slot(flags)]++;
-       total_pages++;
-}
-
-#define KPAGEFLAGS_BATCH       (64 << 10)      /* 64k pages */
-static void walk_pfn(unsigned long voffset,
-                    unsigned long index,
-                    unsigned long count)
-{
-       uint64_t buf[KPAGEFLAGS_BATCH];
-       unsigned long batch;
-       long pages;
-       unsigned long i;
-
-       while (count) {
-               batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
-               pages = kpageflags_read(buf, index, batch);
-               if (pages == 0)
-                       break;
-
-               for (i = 0; i < pages; i++)
-                       add_page(voffset + i, index + i, buf[i]);
-
-               index += pages;
-               count -= pages;
-       }
-}
-
-#define PAGEMAP_BATCH  (64 << 10)
-static void walk_vma(unsigned long index, unsigned long count)
-{
-       uint64_t buf[PAGEMAP_BATCH];
-       unsigned long batch;
-       unsigned long pages;
-       unsigned long pfn;
-       unsigned long i;
-
-       while (count) {
-               batch = min_t(unsigned long, count, PAGEMAP_BATCH);
-               pages = pagemap_read(buf, index, batch);
-               if (pages == 0)
-                       break;
-
-               for (i = 0; i < pages; i++) {
-                       pfn = pagemap_pfn(buf[i]);
-                       if (pfn)
-                               walk_pfn(index + i, pfn, 1);
-               }
-
-               index += pages;
-               count -= pages;
-       }
-}
-
-static void walk_task(unsigned long index, unsigned long count)
-{
-       const unsigned long end = index + count;
-       unsigned long start;
-       int i = 0;
-
-       while (index < end) {
-
-               while (pg_end[i] <= index)
-                       if (++i >= nr_vmas)
-                               return;
-               if (pg_start[i] >= end)
-                       return;
-
-               start = max_t(unsigned long, pg_start[i], index);
-               index = min_t(unsigned long, pg_end[i], end);
-
-               assert(start < index);
-               walk_vma(start, index - start);
-       }
-}
-
-static void add_addr_range(unsigned long offset, unsigned long size)
-{
-       if (nr_addr_ranges >= MAX_ADDR_RANGES)
-               fatal("too many addr ranges\n");
-
-       opt_offset[nr_addr_ranges] = offset;
-       opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
-       nr_addr_ranges++;
-}
-
-static void walk_addr_ranges(void)
-{
-       int i;
-
-       kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
-
-       if (!nr_addr_ranges)
-               add_addr_range(0, ULONG_MAX);
-
-       for (i = 0; i < nr_addr_ranges; i++)
-               if (!opt_pid)
-                       walk_pfn(0, opt_offset[i], opt_size[i]);
-               else
-                       walk_task(opt_offset[i], opt_size[i]);
-
-       close(kpageflags_fd);
-}
-
-
-/*
- * user interface
- */
-
-static const char *page_flag_type(uint64_t flag)
-{
-       if (flag & KPF_HACKERS_BITS)
-               return "(r)";
-       if (flag & KPF_OVERLOADED_BITS)
-               return "(o)";
-       return "   ";
-}
-
-static void usage(void)
-{
-       int i, j;
-
-       printf(
-"page-types [options]\n"
-"            -r|--raw                   Raw mode, for kernel developers\n"
-"            -d|--describe flags        Describe flags\n"
-"            -a|--addr    addr-spec     Walk a range of pages\n"
-"            -b|--bits    bits-spec     Walk pages with specified bits\n"
-"            -p|--pid     pid           Walk process address space\n"
-#if 0 /* planned features */
-"            -f|--file    filename      Walk file address space\n"
-#endif
-"            -l|--list                  Show page details in ranges\n"
-"            -L|--list-each             Show page details one by one\n"
-"            -N|--no-summary            Don't show summary info\n"
-"            -X|--hwpoison              hwpoison pages\n"
-"            -x|--unpoison              unpoison pages\n"
-"            -h|--help                  Show this usage message\n"
-"flags:\n"
-"            0x10                       bitfield format, e.g.\n"
-"            anon                       bit-name, e.g.\n"
-"            0x10,anon                  comma-separated list, e.g.\n"
-"addr-spec:\n"
-"            N                          one page at offset N (unit: pages)\n"
-"            N+M                        pages range from N to N+M-1\n"
-"            N,M                        pages range from N to M-1\n"
-"            N,                         pages range from N to end\n"
-"            ,M                         pages range from 0 to M-1\n"
-"bits-spec:\n"
-"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
-"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
-"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
-"            =bit1,bit2                 flags == (bit1|bit2)\n"
-"bit-names:\n"
-       );
-
-       for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-               if (!page_flag_names[i])
-                       continue;
-               printf("%16s%s", page_flag_names[i] + 2,
-                                page_flag_type(1ULL << i));
-               if (++j > 3) {
-                       j = 0;
-                       putchar('\n');
-               }
-       }
-       printf("\n                                   "
-               "(r) raw mode bits  (o) overloaded bits\n");
-}
-
-static unsigned long long parse_number(const char *str)
-{
-       unsigned long long n;
-
-       n = strtoll(str, NULL, 0);
-
-       if (n == 0 && str[0] != '0')
-               fatal("invalid name or number: %s\n", str);
-
-       return n;
-}
-
-static void parse_pid(const char *str)
-{
-       FILE *file;
-       char buf[5000];
-
-       opt_pid = parse_number(str);
-
-       sprintf(buf, "/proc/%d/pagemap", opt_pid);
-       pagemap_fd = checked_open(buf, O_RDONLY);
-
-       sprintf(buf, "/proc/%d/maps", opt_pid);
-       file = fopen(buf, "r");
-       if (!file) {
-               perror(buf);
-               exit(EXIT_FAILURE);
-       }
-
-       while (fgets(buf, sizeof(buf), file) != NULL) {
-               unsigned long vm_start;
-               unsigned long vm_end;
-               unsigned long long pgoff;
-               int major, minor;
-               char r, w, x, s;
-               unsigned long ino;
-               int n;
-
-               n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
-                          &vm_start,
-                          &vm_end,
-                          &r, &w, &x, &s,
-                          &pgoff,
-                          &major, &minor,
-                          &ino);
-               if (n < 10) {
-                       fprintf(stderr, "unexpected line: %s\n", buf);
-                       continue;
-               }
-               pg_start[nr_vmas] = vm_start / page_size;
-               pg_end[nr_vmas] = vm_end / page_size;
-               if (++nr_vmas >= MAX_VMAS) {
-                       fprintf(stderr, "too many VMAs\n");
-                       break;
-               }
-       }
-       fclose(file);
-}
-
-static void parse_file(const char *name)
-{
-}
-
-static void parse_addr_range(const char *optarg)
-{
-       unsigned long offset;
-       unsigned long size;
-       char *p;
-
-       p = strchr(optarg, ',');
-       if (!p)
-               p = strchr(optarg, '+');
-
-       if (p == optarg) {
-               offset = 0;
-               size   = parse_number(p + 1);
-       } else if (p) {
-               offset = parse_number(optarg);
-               if (p[1] == '\0')
-                       size = ULONG_MAX;
-               else {
-                       size = parse_number(p + 1);
-                       if (*p == ',') {
-                               if (size < offset)
-                                       fatal("invalid range: %lu,%lu\n",
-                                                       offset, size);
-                               size -= offset;
-                       }
-               }
-       } else {
-               offset = parse_number(optarg);
-               size   = 1;
-       }
-
-       add_addr_range(offset, size);
-}
-
-static void add_bits_filter(uint64_t mask, uint64_t bits)
-{
-       if (nr_bit_filters >= MAX_BIT_FILTERS)
-               fatal("too much bit filters\n");
-
-       opt_mask[nr_bit_filters] = mask;
-       opt_bits[nr_bit_filters] = bits;
-       nr_bit_filters++;
-}
-
-static uint64_t parse_flag_name(const char *str, int len)
-{
-       int i;
-
-       if (!*str || !len)
-               return 0;
-
-       if (len <= 8 && !strncmp(str, "compound", len))
-               return BITS_COMPOUND;
-
-       for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
-               if (!page_flag_names[i])
-                       continue;
-               if (!strncmp(str, page_flag_names[i] + 2, len))
-                       return 1ULL << i;
-       }
-
-       return parse_number(str);
-}
-
-static uint64_t parse_flag_names(const char *str, int all)
-{
-       const char *p    = str;
-       uint64_t   flags = 0;
-
-       while (1) {
-               if (*p == ',' || *p == '=' || *p == '\0') {
-                       if ((*str != '~') || (*str == '~' && all && *++str))
-                               flags |= parse_flag_name(str, p - str);
-                       if (*p != ',')
-                               break;
-                       str = p + 1;
-               }
-               p++;
-       }
-
-       return flags;
-}
-
-static void parse_bits_mask(const char *optarg)
-{
-       uint64_t mask;
-       uint64_t bits;
-       const char *p;
-
-       p = strchr(optarg, '=');
-       if (p == optarg) {
-               mask = KPF_ALL_BITS;
-               bits = parse_flag_names(p + 1, 0);
-       } else if (p) {
-               mask = parse_flag_names(optarg, 0);
-               bits = parse_flag_names(p + 1, 0);
-       } else if (strchr(optarg, '~')) {
-               mask = parse_flag_names(optarg, 1);
-               bits = parse_flag_names(optarg, 0);
-       } else {
-               mask = parse_flag_names(optarg, 0);
-               bits = KPF_ALL_BITS;
-       }
-
-       add_bits_filter(mask, bits);
-}
-
-static void describe_flags(const char *optarg)
-{
-       uint64_t flags = parse_flag_names(optarg, 0);
-
-       printf("0x%016llx\t%s\t%s\n",
-               (unsigned long long)flags,
-               page_flag_name(flags),
-               page_flag_longname(flags));
-}
-
-static const struct option opts[] = {
-       { "raw"       , 0, NULL, 'r' },
-       { "pid"       , 1, NULL, 'p' },
-       { "file"      , 1, NULL, 'f' },
-       { "addr"      , 1, NULL, 'a' },
-       { "bits"      , 1, NULL, 'b' },
-       { "describe"  , 1, NULL, 'd' },
-       { "list"      , 0, NULL, 'l' },
-       { "list-each" , 0, NULL, 'L' },
-       { "no-summary", 0, NULL, 'N' },
-       { "hwpoison"  , 0, NULL, 'X' },
-       { "unpoison"  , 0, NULL, 'x' },
-       { "help"      , 0, NULL, 'h' },
-       { NULL        , 0, NULL, 0 }
-};
-
-int main(int argc, char *argv[])
-{
-       int c;
-
-       page_size = getpagesize();
-
-       while ((c = getopt_long(argc, argv,
-                               "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) {
-               switch (c) {
-               case 'r':
-                       opt_raw = 1;
-                       break;
-               case 'p':
-                       parse_pid(optarg);
-                       break;
-               case 'f':
-                       parse_file(optarg);
-                       break;
-               case 'a':
-                       parse_addr_range(optarg);
-                       break;
-               case 'b':
-                       parse_bits_mask(optarg);
-                       break;
-               case 'd':
-                       describe_flags(optarg);
-                       exit(0);
-               case 'l':
-                       opt_list = 1;
-                       break;
-               case 'L':
-                       opt_list = 2;
-                       break;
-               case 'N':
-                       opt_no_summary = 1;
-                       break;
-               case 'X':
-                       opt_hwpoison = 1;
-                       prepare_hwpoison_fd();
-                       break;
-               case 'x':
-                       opt_unpoison = 1;
-                       prepare_hwpoison_fd();
-                       break;
-               case 'h':
-                       usage();
-                       exit(0);
-               default:
-                       usage();
-                       exit(1);
-               }
-       }
-
-       if (opt_list && opt_pid)
-               printf("voffset\t");
-       if (opt_list == 1)
-               printf("offset\tlen\tflags\n");
-       if (opt_list == 2)
-               printf("offset\tflags\n");
-
-       walk_addr_ranges();
-
-       if (opt_list == 1)
-               show_page_range(0, 0, 0);  /* drain the buffer */
-
-       if (opt_no_summary)
-               return 0;
-
-       if (opt_list)
-               printf("\n\n");
-
-       show_summary();
-
-       return 0;
-}
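
page-types itself moves to tools/vm/ essentially unchanged. Everything it does rests
on the two u64-per-page files decoded by the PM_* macros above. As a self-contained
illustration, translating one virtual address of the calling process to a PFN looks
roughly like this (a sketch against the 2012-era pagemap ABI as encoded in the deleted
file; error handling kept minimal):

    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* PM_PRESENT is bit 63, PM_PFRAME is the low 55 bits (see macros above) */
    static uint64_t vaddr_to_pfn(const void *vaddr)
    {
            uint64_t entry = 0;
            long psize = sysconf(_SC_PAGESIZE);
            off_t off = (off_t)((uintptr_t)vaddr / psize) * sizeof(entry);
            int fd = open("/proc/self/pagemap", O_RDONLY);

            if (fd < 0)
                    return 0;
            if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry) ||
                !(entry & (1ULL << 63)))                /* PM_PRESENT */
                    entry = 0;
            close(fd);
            return entry & ((1ULL << 55) - 1);          /* PM_PFRAME() */
    }

The tool batches such reads KPAGEFLAGS_BATCH/PAGEMAP_BATCH entries at a time rather
than issuing one read per page.
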
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index 7dcb35285be7bd0d805c303f3dfc64d2ef74ebfb..02c5d2ce23bf121f17479bba2b52460419481930 100644
 #include <asm/smp_plat.h>
 #include <asm/tlbflush.h>
 
-static void on_each_cpu_mask(void (*func)(void *), void *info, int wait,
-       const struct cpumask *mask)
-{
-       preempt_disable();
-
-       smp_call_function_many(mask, func, info, wait);
-       if (cpumask_test_cpu(smp_processor_id(), mask))
-               func(info);
-
-       preempt_enable();
-}
-
 /**********************************************************************/
 
 /*
@@ -87,7 +75,7 @@ void flush_tlb_all(void)
 void flush_tlb_mm(struct mm_struct *mm)
 {
        if (tlb_ops_need_broadcast())
-               on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm));
+               on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1);
        else
                local_flush_tlb_mm(mm);
 }
@@ -98,7 +86,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
                struct tlb_args ta;
                ta.ta_vma = vma;
                ta.ta_start = uaddr;
-               on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm));
+               on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page,
+                                       &ta, 1);
        } else
                local_flush_tlb_page(vma, uaddr);
 }
@@ -121,7 +110,8 @@ void flush_tlb_range(struct vm_area_struct *vma,
                ta.ta_vma = vma;
                ta.ta_start = start;
                ta.ta_end = end;
-               on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm));
+               on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range,
+                                       &ta, 1);
        } else
                local_flush_tlb_range(vma, start, end);
 }
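
The call-site churn above is a signature change: the local ARM helper took the mask
last and wait as an int, while the generic helper this series adds to the core takes
the mask first, matching smp_call_function_many(). The new declaration in
include/linux/smp.h is, modulo formatting:

    void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
                          void *info, bool wait);
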
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index d1cc81e63ba63a08602fc58f4c1e3e63db1a065c..ac795d311f4411cef9cd8c76b1e91cc6b7d5988c 100644
@@ -843,7 +843,7 @@ early_param("additional_cpus", setup_additional_cpus);
  * are onlined, or offlined. The reason is per-cpu data-structures
  * are allocated by some modules at init time, and dont expect to
  * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
+ * cpu_present_mask on the other hand can change dynamically.
  * In case when cpu_hotplug is not compiled, then we resort to current
  * behaviour, which is cpu_possible == cpu_present.
  * - Ashok Raj
@@ -921,7 +921,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 
        acpi_map_cpu2node(handle, cpu, physid);
 
-       cpu_set(cpu, cpu_present_map);
+       set_cpu_present(cpu, true);
        ia64_cpu_to_sapicid[cpu] = physid;
 
        acpi_processor_set_pdc(handle);
@@ -940,7 +940,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 int acpi_unmap_lsapic(int cpu)
 {
        ia64_cpu_to_sapicid[cpu] = -1;
-       cpu_clear(cpu, cpu_present_map);
+       set_cpu_present(cpu, false);
 
 #ifdef CONFIG_ACPI_NUMA
        /* NUMA specific cleanup's */
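
The ia64 changes here and in the files that follow are one mechanical conversion:
direct writes to the cpu_possible/present/online_map bitmaps via cpu_set()/cpu_clear()
become set_cpu_*() accessor calls, and the cpus_*() operations on cpumask_t values
become cpumask_*() operations on pointers, with the read-only cpu_*_mask pointers
replacing the old maps. A hypothetical helper showing the new idiom (illustrative
only, not from the patch):

    static int pick_cpu_from(const cpumask_t *domain)
    {
            cpumask_t mask;

            /* was: cpus_and(mask, *domain, cpu_online_map) */
            cpumask_and(&mask, domain, cpu_online_mask);
            if (cpumask_empty(&mask))
                    return -EINVAL;
            /* was: first_cpu(mask) */
            return cpumask_first(&mask);
    }
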
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 08113b1d30f714a25e903875d3dc1f6e9c85dbc5..5c3e0888265a80a2fe75d48f6ed8177aeb3aad7f 100644
@@ -117,7 +117,7 @@ static inline int find_unassigned_vector(cpumask_t domain)
        cpumask_t mask;
        int pos, vector;
 
-       cpus_and(mask, domain, cpu_online_map);
+       cpumask_and(&mask, &domain, cpu_online_mask);
        if (cpus_empty(mask))
                return -EINVAL;
 
@@ -140,7 +140,7 @@ static int __bind_irq_vector(int irq, int vector, cpumask_t domain)
        BUG_ON((unsigned)irq >= NR_IRQS);
        BUG_ON((unsigned)vector >= IA64_NUM_VECTORS);
 
-       cpus_and(mask, domain, cpu_online_map);
+       cpumask_and(&mask, &domain, cpu_online_mask);
        if (cpus_empty(mask))
                return -EINVAL;
        if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain))
@@ -178,7 +178,7 @@ static void __clear_irq_vector(int irq)
        BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED);
        vector = cfg->vector;
        domain = cfg->domain;
-       cpus_and(mask, cfg->domain, cpu_online_map);
+       cpumask_and(&mask, &cfg->domain, cpu_online_mask);
        for_each_cpu_mask(cpu, mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
        cfg->vector = IRQ_VECTOR_UNASSIGNED;
@@ -321,7 +321,7 @@ void irq_complete_move(unsigned irq)
        if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain)))
                return;
 
-       cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+       cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask);
        cfg->move_cleanup_count = cpus_weight(cleanup_mask);
        for_each_cpu_mask(i, cleanup_mask)
                platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0);
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index a39fe098a7322e15a480add1176d8154e55c949a..65bf9cd390443c3574e8d4da5a175c566928b535 100644
@@ -1514,7 +1514,8 @@ static void
 ia64_mca_cmc_poll (unsigned long dummy)
 {
        /* Trigger a CMC interrupt cascade  */
-       platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
+       platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR,
+                                                       IA64_IPI_DM_INT, 0);
 }
 
 /*
@@ -1590,7 +1591,8 @@ static void
 ia64_mca_cpe_poll (unsigned long dummy)
 {
        /* Trigger a CPE interrupt cascade  */
-       platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
+       platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR,
+                                                       IA64_IPI_DM_INT, 0);
 }
 
 #endif /* CONFIG_ACPI */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 94e0db72d4a68591370de63af9db07baaae203ab..fb2f1e622877e202f8bdb6be4d6922fa6431986f 100644
@@ -57,7 +57,7 @@ int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
                return irq;
 
        irq_set_msi_desc(irq, desc);
-       cpus_and(mask, irq_to_domain(irq), cpu_online_map);
+       cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask);
        dest_phys_id = cpu_physical_id(first_cpu(mask));
        vector = irq_to_vector(irq);
 
@@ -179,7 +179,7 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
        unsigned dest;
        cpumask_t mask;
 
-       cpus_and(mask, irq_to_domain(irq), cpu_online_map);
+       cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask);
        dest = cpu_physical_id(first_cpu(mask));
 
        msg->address_hi = 0;
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index c45e6ddb4ddb0ffc50c48d5936ea25379211eb3b..aaefd9b94f2fc6f57865b5d500bceaf50fdc84b3 100644
@@ -485,7 +485,7 @@ mark_bsp_online (void)
 {
 #ifdef CONFIG_SMP
        /* If we register an early console, allow CPU 0 to printk */
-       cpu_set(smp_processor_id(), cpu_online_map);
+       set_cpu_online(smp_processor_id(), true);
 #endif
 }
 
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index e27f925032aeb73529196fcd41c9b550bda5c8ed..9fcd4e63048f65f5ae638833c7a51b756883acbb 100644
@@ -76,7 +76,7 @@ stop_this_cpu(void)
        /*
         * Remove this CPU:
         */
-       cpu_clear(smp_processor_id(), cpu_online_map);
+       set_cpu_online(smp_processor_id(), false);
        max_xtp();
        local_irq_disable();
        cpu_halt();
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index fb7927be75c4a4aedf774801b39e3d7ab1df32b7..796f6a5b966a742d46a4f63417939e0a737a0732 100644
@@ -400,7 +400,7 @@ smp_callin (void)
        /* Setup the per cpu irq handling data structures */
        __setup_vector_irq(cpuid);
        notify_cpu_starting(cpuid);
-       cpu_set(cpuid, cpu_online_map);
+       set_cpu_online(cpuid, true);
        per_cpu(cpu_state, cpuid) = CPU_ONLINE;
        spin_unlock(&vector_lock);
        ipi_call_unlock_irq();
@@ -547,7 +547,7 @@ do_rest:
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
                ia64_cpu_to_sapicid[cpu] = -1;
-               cpu_clear(cpu, cpu_online_map);  /* was set in smp_callin() */
+               set_cpu_online(cpu, false);  /* was set in smp_callin() */
                return -EINVAL;
        }
        return 0;
@@ -577,8 +577,7 @@ smp_build_cpu_map (void)
        }
 
        ia64_cpu_to_sapicid[0] = boot_cpu_id;
-       cpus_clear(cpu_present_map);
-       set_cpu_present(0, true);
+       init_cpu_present(cpumask_of(0));
        set_cpu_possible(0, true);
        for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
                sapicid = smp_boot_data.cpu_phys_id[i];
@@ -605,10 +604,6 @@ smp_prepare_cpus (unsigned int max_cpus)
 
        smp_setup_percpu_timer();
 
-       /*
-        * We have the boot CPU online for sure.
-        */
-       cpu_set(0, cpu_online_map);
        cpu_set(0, cpu_callin_map);
 
        local_cpu_data->loops_per_jiffy = loops_per_jiffy;
@@ -632,7 +627,7 @@ smp_prepare_cpus (unsigned int max_cpus)
 
 void __devinit smp_prepare_boot_cpu(void)
 {
-       cpu_set(smp_processor_id(), cpu_online_map);
+       set_cpu_online(smp_processor_id(), true);
        cpu_set(smp_processor_id(), cpu_callin_map);
        set_numa_node(cpu_to_node_map[smp_processor_id()]);
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
@@ -689,7 +684,7 @@ int migrate_platform_irqs(unsigned int cpu)
                        /*
                         * Now re-target the CPEI to a different processor
                         */
-                       new_cpei_cpu = any_online_cpu(cpu_online_map);
+                       new_cpei_cpu = cpumask_any(cpu_online_mask);
                        mask = cpumask_of(new_cpei_cpu);
                        set_cpei_target_cpu(new_cpei_cpu);
                        data = irq_get_irq_data(ia64_cpe_irq);
@@ -731,10 +726,10 @@ int __cpu_disable(void)
                        return -EBUSY;
        }
 
-       cpu_clear(cpu, cpu_online_map);
+       set_cpu_online(cpu, false);
 
        if (migrate_platform_irqs(cpu)) {
-               cpu_set(cpu, cpu_online_map);
+               set_cpu_online(cpu, true);
                return -EBUSY;
        }
 
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 9deb21dbf62965740f2b6c34c1630dbf80d1f4aa..c64460b9c704d56e7cb5efd9fb750e251a9f707f 100644
@@ -220,7 +220,8 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
        ssize_t len;
        cpumask_t shared_cpu_map;
 
-       cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map);
+       cpumask_and(&shared_cpu_map,
+                               &this_leaf->shared_cpu_map, cpu_online_mask);
        len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map);
        len += sprintf(buf+len, "\n");
        return len;
diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h
index 532124ae4b128f896335b0629af9043fc5277aaa..1aa759aeb5b3f0db2842a22c99b2f493dca10cf7 100644
@@ -43,10 +43,6 @@ void evaluate_message(int tag);
 /* Boot a secondary cpu */
 void online_secondary(void);
 
-/* Call a function on a specified set of CPUs (may include this one). */
-extern void on_each_cpu_mask(const struct cpumask *mask,
-                            void (*func)(void *), void *info, bool wait);
-
 /* Topology of the supervisor tile grid, and coordinates of boot processor */
 extern HV_Topology smp_topology;
 
@@ -91,9 +87,6 @@ void print_disabled_cpus(void);
 
 #else /* !CONFIG_SMP */
 
-#define on_each_cpu_mask(mask, func, info, wait)               \
-  do { if (cpumask_test_cpu(0, (mask))) func(info); } while (0)
-
 #define smp_master_cpu         0
 #define smp_height             1
 #define smp_width              1
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index c52224d5ed45f06a38f4461e19f485a2bb4cadb3..a44e103c5a636670bdbd358f5143419c24d19d7a 100644
@@ -87,25 +87,6 @@ void send_IPI_allbutself(int tag)
        send_IPI_many(&mask, tag);
 }
 
-
-/*
- * Provide smp_call_function_mask, but also run function locally
- * if specified in the mask.
- */
-void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *),
-                     void *info, bool wait)
-{
-       int cpu = get_cpu();
-       smp_call_function_many(mask, func, info, wait);
-       if (cpumask_test_cpu(cpu, mask)) {
-               local_irq_disable();
-               func(info);
-               local_irq_enable();
-       }
-       put_cpu();
-}
-
-
 /*
  * Functions related to starting/stopping cpus.
  */
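
The tile implementation deleted above is essentially what the series promotes into
kernel/smp.c for all architectures. Roughly (a sketch, modulo comments and kerneldoc):

    void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
                          void *info, bool wait)
    {
            int cpu = get_cpu();    /* pins us and disables preemption */

            smp_call_function_many(mask, func, info, wait);
            if (cpumask_test_cpu(cpu, mask)) {
                    /* run locally with IRQs off, as an IPI handler would */
                    local_irq_disable();
                    func(info);
                    local_irq_enable();
            }
            put_cpu();
    }

Note that the ARM copy removed earlier invoked func() with interrupts enabled; the
generic version keeps the local invocation consistent with how the function runs on
the remote CPUs.
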
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8cbeb7209c3e2a24af1e8b681a12494e4b00ee1c..1a2901562059a48a4ec863c64e4150b881ed9271 100644
@@ -508,15 +508,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 
 #ifdef CONFIG_KEXEC
 
-static inline unsigned long long get_total_mem(void)
-{
-       unsigned long long total;
-
-       total = max_pfn - min_low_pfn;
-
-       return total << PAGE_SHIFT;
-}
-
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -535,7 +526,7 @@ static void __init reserve_crashkernel(void)
        unsigned long long crash_size, crash_base;
        int ret;
 
-       total_mem = get_total_mem();
+       total_mem = memblock_phys_mem_size();
 
        ret = parse_crashkernel(boot_command_line, total_mem,
                        &crash_size, &crash_base);
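
get_total_mem() approximated memory as the pfn span max_pfn - min_low_pfn, which overcounts whenever the physical map has holes; memblock_phys_mem_size() sums only the registered ranges. A userspace toy (the ranges are made up, for illustration only) showing the difference:

    #include <stdio.h>

    struct range { unsigned long start, end; };     /* [start, end), bytes */

    int main(void)
    {
            /* made-up map with a hole between the two ranges */
            struct range map[] = {
                    { 0x00000000UL, 0x0009f000UL },
                    { 0x00100000UL, 0x7ff00000UL },
            };
            unsigned long span = map[1].end - map[0].start; /* ~(max_pfn - min_low_pfn) */
            unsigned long sum = 0;
            unsigned int i;

            for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
                    sum += map[i].end - map[i].start;
            printf("span=%#lx sum=%#lx\n", span, sum);      /* span > sum */
            return 0;
    }
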
index c7ba11f9b203a4e83a66307b299bc67a513f4365..061427a75d375a5ed0655bdac8422e668a252a5e 100644 (file)
@@ -38,7 +38,7 @@
 
 #include <linux/nbd.h>
 
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
 
 #ifdef NDEBUG
 #define dprintk(flags, fmt...)
@@ -115,7 +115,7 @@ static void nbd_end_request(struct request *req)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
 {
        /* Forcibly shutdown the socket causing all listeners
         * to error
@@ -124,14 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
         * there should be a more generic interface rather than
         * calling socket ops directly here */
        if (lock)
-               mutex_lock(&lo->tx_lock);
-       if (lo->sock) {
-               dev_warn(disk_to_dev(lo->disk), "shutting down socket\n");
-               kernel_sock_shutdown(lo->sock, SHUT_RDWR);
-               lo->sock = NULL;
+               mutex_lock(&nbd->tx_lock);
+       if (nbd->sock) {
+               dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+               kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+               nbd->sock = NULL;
        }
        if (lock)
-               mutex_unlock(&lo->tx_lock);
+               mutex_unlock(&nbd->tx_lock);
 }
 
 static void nbd_xmit_timeout(unsigned long arg)
@@ -146,17 +146,17 @@ static void nbd_xmit_timeout(unsigned long arg)
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
                int msg_flags)
 {
-       struct socket *sock = lo->sock;
+       struct socket *sock = nbd->sock;
        int result;
        struct msghdr msg;
        struct kvec iov;
        sigset_t blocked, oldset;
 
        if (unlikely(!sock)) {
-               dev_err(disk_to_dev(lo->disk),
+               dev_err(disk_to_dev(nbd->disk),
                        "Attempted %s on closed socket in sock_xmit\n",
                        (send ? "send" : "recv"));
                return -EINVAL;
@@ -180,15 +180,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
                if (send) {
                        struct timer_list ti;
 
-                       if (lo->xmit_timeout) {
+                       if (nbd->xmit_timeout) {
                                init_timer(&ti);
                                ti.function = nbd_xmit_timeout;
                                ti.data = (unsigned long)current;
-                               ti.expires = jiffies + lo->xmit_timeout;
+                               ti.expires = jiffies + nbd->xmit_timeout;
                                add_timer(&ti);
                        }
                        result = kernel_sendmsg(sock, &msg, &iov, 1, size);
-                       if (lo->xmit_timeout)
+                       if (nbd->xmit_timeout)
                                del_timer_sync(&ti);
                } else
                        result = kernel_recvmsg(sock, &msg, &iov, 1, size,
@@ -200,7 +200,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
                                task_pid_nr(current), current->comm,
                                dequeue_signal_lock(current, &current->blocked, &info));
                        result = -EINTR;
-                       sock_shutdown(lo, !send);
+                       sock_shutdown(nbd, !send);
                        break;
                }
 
@@ -218,18 +218,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
        return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
                int flags)
 {
        int result;
        void *kaddr = kmap(bvec->bv_page);
-       result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+       result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+                          bvec->bv_len, flags);
        kunmap(bvec->bv_page);
        return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 {
        int result, flags;
        struct nbd_request request;
@@ -242,14 +243,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
        memcpy(request.handle, &req, sizeof(req));
 
        dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
-                       lo->disk->disk_name, req,
+                       nbd->disk->disk_name, req,
                        nbdcmd_to_ascii(nbd_cmd(req)),
                        (unsigned long long)blk_rq_pos(req) << 9,
                        blk_rq_bytes(req));
-       result = sock_xmit(lo, 1, &request, sizeof(request),
+       result = sock_xmit(nbd, 1, &request, sizeof(request),
                        (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
        if (result <= 0) {
-               dev_err(disk_to_dev(lo->disk),
+               dev_err(disk_to_dev(nbd->disk),
                        "Send control failed (result %d)\n", result);
                goto error_out;
        }
@@ -266,10 +267,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
                        if (!rq_iter_last(req, iter))
                                flags = MSG_MORE;
                        dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
-                                       lo->disk->disk_name, req, bvec->bv_len);
-                       result = sock_send_bvec(lo, bvec, flags);
+                                       nbd->disk->disk_name, req, bvec->bv_len);
+                       result = sock_send_bvec(nbd, bvec, flags);
                        if (result <= 0) {
-                               dev_err(disk_to_dev(lo->disk),
+                               dev_err(disk_to_dev(nbd->disk),
                                        "Send data failed (result %d)\n",
                                        result);
                                goto error_out;
@@ -282,25 +283,25 @@ error_out:
        return -EIO;
 }
 
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
                                        struct request *xreq)
 {
        struct request *req, *tmp;
        int err;
 
-       err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+       err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
        if (unlikely(err))
                goto out;
 
-       spin_lock(&lo->queue_lock);
-       list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+       spin_lock(&nbd->queue_lock);
+       list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
                if (req != xreq)
                        continue;
                list_del_init(&req->queuelist);
-               spin_unlock(&lo->queue_lock);
+               spin_unlock(&nbd->queue_lock);
                return req;
        }
-       spin_unlock(&lo->queue_lock);
+       spin_unlock(&nbd->queue_lock);
 
        err = -ENOENT;
 
@@ -308,78 +309,78 @@ out:
        return ERR_PTR(err);
 }
 
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 {
        int result;
        void *kaddr = kmap(bvec->bv_page);
-       result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+       result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
                        MSG_WAITALL);
        kunmap(bvec->bv_page);
        return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
 {
        int result;
        struct nbd_reply reply;
        struct request *req;
 
        reply.magic = 0;
-       result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+       result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
        if (result <= 0) {
-               dev_err(disk_to_dev(lo->disk),
+               dev_err(disk_to_dev(nbd->disk),
                        "Receive control failed (result %d)\n", result);
                goto harderror;
        }
 
        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
-               dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n",
+               dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
                                (unsigned long)ntohl(reply.magic));
                result = -EPROTO;
                goto harderror;
        }
 
-       req = nbd_find_request(lo, *(struct request **)reply.handle);
+       req = nbd_find_request(nbd, *(struct request **)reply.handle);
        if (IS_ERR(req)) {
                result = PTR_ERR(req);
                if (result != -ENOENT)
                        goto harderror;
 
-               dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n",
+               dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
                        reply.handle);
                result = -EBADR;
                goto harderror;
        }
 
        if (ntohl(reply.error)) {
-               dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n",
+               dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
                        ntohl(reply.error));
                req->errors++;
                return req;
        }
 
        dprintk(DBG_RX, "%s: request %p: got reply\n",
-                       lo->disk->disk_name, req);
+                       nbd->disk->disk_name, req);
        if (nbd_cmd(req) == NBD_CMD_READ) {
                struct req_iterator iter;
                struct bio_vec *bvec;
 
                rq_for_each_segment(bvec, req, iter) {
-                       result = sock_recv_bvec(lo, bvec);
+                       result = sock_recv_bvec(nbd, bvec);
                        if (result <= 0) {
-                               dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n",
+                               dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
                                req->errors++;
                                return req;
                        }
                        dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
-                               lo->disk->disk_name, req, bvec->bv_len);
+                               nbd->disk->disk_name, req, bvec->bv_len);
                }
        }
        return req;
 harderror:
-       lo->harderror = result;
+       nbd->harderror = result;
        return NULL;
 }
 
@@ -397,48 +398,48 @@ static struct device_attribute pid_attr = {
        .show = pid_show,
 };
 
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
 {
        struct request *req;
        int ret;
 
-       BUG_ON(lo->magic != LO_MAGIC);
+       BUG_ON(nbd->magic != NBD_MAGIC);
 
-       lo->pid = task_pid_nr(current);
-       ret = device_create_file(disk_to_dev(lo->disk), &pid_attr);
+       nbd->pid = task_pid_nr(current);
+       ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (ret) {
-               dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n");
-               lo->pid = 0;
+               dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+               nbd->pid = 0;
                return ret;
        }
 
-       while ((req = nbd_read_stat(lo)) != NULL)
+       while ((req = nbd_read_stat(nbd)) != NULL)
                nbd_end_request(req);
 
-       device_remove_file(disk_to_dev(lo->disk), &pid_attr);
-       lo->pid = 0;
+       device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+       nbd->pid = 0;
        return 0;
 }
 
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
 {
        struct request *req;
 
-       BUG_ON(lo->magic != LO_MAGIC);
+       BUG_ON(nbd->magic != NBD_MAGIC);
 
        /*
-        * Because we have set lo->sock to NULL under the tx_lock, all
+        * Because we have set nbd->sock to NULL under the tx_lock, all
         * modifications to the list must have completed by now.  For
         * the same reason, the active_req must be NULL.
         *
         * As a consequence, we don't need to take the spin lock while
         * purging the list here.
         */
-       BUG_ON(lo->sock);
-       BUG_ON(lo->active_req);
+       BUG_ON(nbd->sock);
+       BUG_ON(nbd->active_req);
 
-       while (!list_empty(&lo->queue_head)) {
-               req = list_entry(lo->queue_head.next, struct request,
+       while (!list_empty(&nbd->queue_head)) {
+               req = list_entry(nbd->queue_head.next, struct request,
                                 queuelist);
                list_del_init(&req->queuelist);
                req->errors++;
@@ -447,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo)
 }
 
 
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 {
        if (req->cmd_type != REQ_TYPE_FS)
                goto error_out;
@@ -455,8 +456,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
        nbd_cmd(req) = NBD_CMD_READ;
        if (rq_data_dir(req) == WRITE) {
                nbd_cmd(req) = NBD_CMD_WRITE;
-               if (lo->flags & NBD_READ_ONLY) {
-                       dev_err(disk_to_dev(lo->disk),
+               if (nbd->flags & NBD_READ_ONLY) {
+                       dev_err(disk_to_dev(nbd->disk),
                                "Write on read-only\n");
                        goto error_out;
                }
@@ -464,29 +465,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
 
        req->errors = 0;
 
-       mutex_lock(&lo->tx_lock);
-       if (unlikely(!lo->sock)) {
-               mutex_unlock(&lo->tx_lock);
-               dev_err(disk_to_dev(lo->disk),
+       mutex_lock(&nbd->tx_lock);
+       if (unlikely(!nbd->sock)) {
+               mutex_unlock(&nbd->tx_lock);
+               dev_err(disk_to_dev(nbd->disk),
                        "Attempted send on closed socket\n");
                goto error_out;
        }
 
-       lo->active_req = req;
+       nbd->active_req = req;
 
-       if (nbd_send_req(lo, req) != 0) {
-               dev_err(disk_to_dev(lo->disk), "Request send failed\n");
+       if (nbd_send_req(nbd, req) != 0) {
+               dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
                req->errors++;
                nbd_end_request(req);
        } else {
-               spin_lock(&lo->queue_lock);
-               list_add(&req->queuelist, &lo->queue_head);
-               spin_unlock(&lo->queue_lock);
+               spin_lock(&nbd->queue_lock);
+               list_add(&req->queuelist, &nbd->queue_head);
+               spin_unlock(&nbd->queue_lock);
        }
 
-       lo->active_req = NULL;
-       mutex_unlock(&lo->tx_lock);
-       wake_up_all(&lo->active_wq);
+       nbd->active_req = NULL;
+       mutex_unlock(&nbd->tx_lock);
+       wake_up_all(&nbd->active_wq);
 
        return;
 
@@ -497,28 +498,28 @@ error_out:
 
 static int nbd_thread(void *data)
 {
-       struct nbd_device *lo = data;
+       struct nbd_device *nbd = data;
        struct request *req;
 
        set_user_nice(current, -20);
-       while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+       while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
                /* wait for something to do */
-               wait_event_interruptible(lo->waiting_wq,
+               wait_event_interruptible(nbd->waiting_wq,
                                         kthread_should_stop() ||
-                                        !list_empty(&lo->waiting_queue));
+                                        !list_empty(&nbd->waiting_queue));
 
                /* extract request */
-               if (list_empty(&lo->waiting_queue))
+               if (list_empty(&nbd->waiting_queue))
                        continue;
 
-               spin_lock_irq(&lo->queue_lock);
-               req = list_entry(lo->waiting_queue.next, struct request,
+               spin_lock_irq(&nbd->queue_lock);
+               req = list_entry(nbd->waiting_queue.next, struct request,
                                 queuelist);
                list_del_init(&req->queuelist);
-               spin_unlock_irq(&lo->queue_lock);
+               spin_unlock_irq(&nbd->queue_lock);
 
                /* handle request */
-               nbd_handle_req(lo, req);
+               nbd_handle_req(nbd, req);
        }
        return 0;
 }
@@ -526,7 +527,7 @@ static int nbd_thread(void *data)
 /*
  * We always wait for result of write, for now. It would be nice to make it optional
  * in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
  *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
  */
 
@@ -535,19 +536,19 @@ static void do_nbd_request(struct request_queue *q)
        struct request *req;
        
        while ((req = blk_fetch_request(q)) != NULL) {
-               struct nbd_device *lo;
+               struct nbd_device *nbd;
 
                spin_unlock_irq(q->queue_lock);
 
                dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
                                req->rq_disk->disk_name, req, req->cmd_type);
 
-               lo = req->rq_disk->private_data;
+               nbd = req->rq_disk->private_data;
 
-               BUG_ON(lo->magic != LO_MAGIC);
+               BUG_ON(nbd->magic != NBD_MAGIC);
 
-               if (unlikely(!lo->sock)) {
-                       dev_err(disk_to_dev(lo->disk),
+               if (unlikely(!nbd->sock)) {
+                       dev_err(disk_to_dev(nbd->disk),
                                "Attempted send on closed socket\n");
                        req->errors++;
                        nbd_end_request(req);
@@ -555,11 +556,11 @@ static void do_nbd_request(struct request_queue *q)
                        continue;
                }
 
-               spin_lock_irq(&lo->queue_lock);
-               list_add_tail(&req->queuelist, &lo->waiting_queue);
-               spin_unlock_irq(&lo->queue_lock);
+               spin_lock_irq(&nbd->queue_lock);
+               list_add_tail(&req->queuelist, &nbd->waiting_queue);
+               spin_unlock_irq(&nbd->queue_lock);
 
-               wake_up(&lo->waiting_wq);
+               wake_up(&nbd->waiting_wq);
 
                spin_lock_irq(q->queue_lock);
        }
@@ -567,32 +568,32 @@ static void do_nbd_request(struct request_queue *q)
 
 /* Must be called with tx_lock held */
 
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                       unsigned int cmd, unsigned long arg)
 {
        switch (cmd) {
        case NBD_DISCONNECT: {
                struct request sreq;
 
-               dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n");
+               dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 
                blk_rq_init(NULL, &sreq);
                sreq.cmd_type = REQ_TYPE_SPECIAL;
                nbd_cmd(&sreq) = NBD_CMD_DISC;
-               if (!lo->sock)
+               if (!nbd->sock)
                        return -EINVAL;
-               nbd_send_req(lo, &sreq);
+               nbd_send_req(nbd, &sreq);
                 return 0;
        }
  
        case NBD_CLEAR_SOCK: {
                struct file *file;
 
-               lo->sock = NULL;
-               file = lo->file;
-               lo->file = NULL;
-               nbd_clear_que(lo);
-               BUG_ON(!list_empty(&lo->queue_head));
+               nbd->sock = NULL;
+               file = nbd->file;
+               nbd->file = NULL;
+               nbd_clear_que(nbd);
+               BUG_ON(!list_empty(&nbd->queue_head));
                if (file)
                        fput(file);
                return 0;
@@ -600,14 +601,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 
        case NBD_SET_SOCK: {
                struct file *file;
-               if (lo->file)
+               if (nbd->file)
                        return -EBUSY;
                file = fget(arg);
                if (file) {
                        struct inode *inode = file->f_path.dentry->d_inode;
                        if (S_ISSOCK(inode->i_mode)) {
-                               lo->file = file;
-                               lo->sock = SOCKET_I(inode);
+                               nbd->file = file;
+                               nbd->sock = SOCKET_I(inode);
                                if (max_part > 0)
                                        bdev->bd_invalidated = 1;
                                return 0;
@@ -619,29 +620,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
        }
 
        case NBD_SET_BLKSIZE:
-               lo->blksize = arg;
-               lo->bytesize &= ~(lo->blksize-1);
-               bdev->bd_inode->i_size = lo->bytesize;
-               set_blocksize(bdev, lo->blksize);
-               set_capacity(lo->disk, lo->bytesize >> 9);
+               nbd->blksize = arg;
+               nbd->bytesize &= ~(nbd->blksize-1);
+               bdev->bd_inode->i_size = nbd->bytesize;
+               set_blocksize(bdev, nbd->blksize);
+               set_capacity(nbd->disk, nbd->bytesize >> 9);
                return 0;
 
        case NBD_SET_SIZE:
-               lo->bytesize = arg & ~(lo->blksize-1);
-               bdev->bd_inode->i_size = lo->bytesize;
-               set_blocksize(bdev, lo->blksize);
-               set_capacity(lo->disk, lo->bytesize >> 9);
+               nbd->bytesize = arg & ~(nbd->blksize-1);
+               bdev->bd_inode->i_size = nbd->bytesize;
+               set_blocksize(bdev, nbd->blksize);
+               set_capacity(nbd->disk, nbd->bytesize >> 9);
                return 0;
 
        case NBD_SET_TIMEOUT:
-               lo->xmit_timeout = arg * HZ;
+               nbd->xmit_timeout = arg * HZ;
                return 0;
 
        case NBD_SET_SIZE_BLOCKS:
-               lo->bytesize = ((u64) arg) * lo->blksize;
-               bdev->bd_inode->i_size = lo->bytesize;
-               set_blocksize(bdev, lo->blksize);
-               set_capacity(lo->disk, lo->bytesize >> 9);
+               nbd->bytesize = ((u64) arg) * nbd->blksize;
+               bdev->bd_inode->i_size = nbd->bytesize;
+               set_blocksize(bdev, nbd->blksize);
+               set_capacity(nbd->disk, nbd->bytesize >> 9);
                return 0;
 
        case NBD_DO_IT: {
@@ -649,38 +650,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
                struct file *file;
                int error;
 
-               if (lo->pid)
+               if (nbd->pid)
                        return -EBUSY;
-               if (!lo->file)
+               if (!nbd->file)
                        return -EINVAL;
 
-               mutex_unlock(&lo->tx_lock);
+               mutex_unlock(&nbd->tx_lock);
 
-               thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+               thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
                if (IS_ERR(thread)) {
-                       mutex_lock(&lo->tx_lock);
+                       mutex_lock(&nbd->tx_lock);
                        return PTR_ERR(thread);
                }
                wake_up_process(thread);
-               error = nbd_do_it(lo);
+               error = nbd_do_it(nbd);
                kthread_stop(thread);
 
-               mutex_lock(&lo->tx_lock);
+               mutex_lock(&nbd->tx_lock);
                if (error)
                        return error;
-               sock_shutdown(lo, 0);
-               file = lo->file;
-               lo->file = NULL;
-               nbd_clear_que(lo);
-               dev_warn(disk_to_dev(lo->disk), "queue cleared\n");
+               sock_shutdown(nbd, 0);
+               file = nbd->file;
+               nbd->file = NULL;
+               nbd_clear_que(nbd);
+               dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
                if (file)
                        fput(file);
-               lo->bytesize = 0;
+               nbd->bytesize = 0;
                bdev->bd_inode->i_size = 0;
-               set_capacity(lo->disk, 0);
+               set_capacity(nbd->disk, 0);
                if (max_part > 0)
                        ioctl_by_bdev(bdev, BLKRRPART, 0);
-               return lo->harderror;
+               return nbd->harderror;
        }
 
        case NBD_CLEAR_QUE:
@@ -688,14 +689,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
                 * This is for compatibility only.  The queue is always cleared
                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
                 */
-               BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
+               BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
                return 0;
 
        case NBD_PRINT_DEBUG:
-               dev_info(disk_to_dev(lo->disk),
+               dev_info(disk_to_dev(nbd->disk),
                        "next = %p, prev = %p, head = %p\n",
-                       lo->queue_head.next, lo->queue_head.prev,
-                       &lo->queue_head);
+                       nbd->queue_head.next, nbd->queue_head.prev,
+                       &nbd->queue_head);
                return 0;
        }
        return -ENOTTY;
@@ -704,21 +705,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
                     unsigned int cmd, unsigned long arg)
 {
-       struct nbd_device *lo = bdev->bd_disk->private_data;
+       struct nbd_device *nbd = bdev->bd_disk->private_data;
        int error;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       BUG_ON(lo->magic != LO_MAGIC);
+       BUG_ON(nbd->magic != NBD_MAGIC);
 
        /* Anyone capable of this syscall can do *real bad* things */
        dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
-                       lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+               nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
 
-       mutex_lock(&lo->tx_lock);
-       error = __nbd_ioctl(bdev, lo, cmd, arg);
-       mutex_unlock(&lo->tx_lock);
+       mutex_lock(&nbd->tx_lock);
+       error = __nbd_ioctl(bdev, nbd, cmd, arg);
+       mutex_unlock(&nbd->tx_lock);
 
        return error;
 }
@@ -804,7 +805,7 @@ static int __init nbd_init(void)
        for (i = 0; i < nbds_max; i++) {
                struct gendisk *disk = nbd_dev[i].disk;
                nbd_dev[i].file = NULL;
-               nbd_dev[i].magic = LO_MAGIC;
+               nbd_dev[i].magic = NBD_MAGIC;
                nbd_dev[i].flags = 0;
                INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
                spin_lock_init(&nbd_dev[i].queue_lock);
index cf82fedae09975272fd6517b1fa5346eff95a814..e53fc24c6af3e1b97ea95052dd383efc26138eb3 100644 (file)
@@ -118,8 +118,8 @@ enum kcs_states {
 #define MAX_KCS_WRITE_SIZE IPMI_MAX_MSG_LENGTH
 
 /* Timeouts in microseconds. */
-#define IBF_RETRY_TIMEOUT 1000000
-#define OBF_RETRY_TIMEOUT 1000000
+#define IBF_RETRY_TIMEOUT 5000000
+#define OBF_RETRY_TIMEOUT 5000000
 #define MAX_ERROR_RETRIES 10
 #define ERROR0_OBF_WAIT_JIFFIES (2*HZ)
 
index c90e9390b78c15d517e9431601c755541344bcce..2c29942b132654747af55e6b70ea4857dc9e0de4 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
+#include <linux/interrupt.h>
 
 #define PFX "IPMI message handler: "
 
@@ -52,6 +53,8 @@
 
 static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void);
 static int ipmi_init_msghandler(void);
+static void smi_recv_tasklet(unsigned long);
+static void handle_new_recv_msgs(ipmi_smi_t intf);
 
 static int initialized;
 
@@ -354,12 +357,15 @@ struct ipmi_smi {
        int curr_seq;
 
        /*
-        * Messages that were delayed for some reason (out of memory,
-        * for instance), will go in here to be processed later in a
-        * periodic timer interrupt.
+        * Messages queued for delivery.  If delivery fails (out of memory
+        * for instance), they will stay in here to be processed later in a
+        * periodic timer interrupt.  The tasklet is for handling received
+        * messages directly from the handler.
         */
        spinlock_t       waiting_msgs_lock;
        struct list_head waiting_msgs;
+       atomic_t         watchdog_pretimeouts_to_deliver;
+       struct tasklet_struct recv_tasklet;
 
        /*
         * The list of command receivers that are registered for commands
@@ -492,6 +498,8 @@ static void clean_up_interface_data(ipmi_smi_t intf)
        struct cmd_rcvr  *rcvr, *rcvr2;
        struct list_head list;
 
+       tasklet_kill(&intf->recv_tasklet);
+
        free_smi_msg_list(&intf->waiting_msgs);
        free_recv_msg_list(&intf->waiting_events);
 
@@ -2785,12 +2793,17 @@ channel_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
        return;
 }
 
-void ipmi_poll_interface(ipmi_user_t user)
+static void ipmi_poll(ipmi_smi_t intf)
 {
-       ipmi_smi_t intf = user->intf;
-
        if (intf->handlers->poll)
                intf->handlers->poll(intf->send_info);
+       /* In case something came in */
+       handle_new_recv_msgs(intf);
+}
+
+void ipmi_poll_interface(ipmi_user_t user)
+{
+       ipmi_poll(user->intf);
 }
 EXPORT_SYMBOL(ipmi_poll_interface);
 
@@ -2859,6 +2872,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 #endif
        spin_lock_init(&intf->waiting_msgs_lock);
        INIT_LIST_HEAD(&intf->waiting_msgs);
+       tasklet_init(&intf->recv_tasklet,
+                    smi_recv_tasklet,
+                    (unsigned long) intf);
+       atomic_set(&intf->watchdog_pretimeouts_to_deliver, 0);
        spin_lock_init(&intf->events_lock);
        INIT_LIST_HEAD(&intf->waiting_events);
        intf->waiting_events_count = 0;
@@ -3621,11 +3638,11 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 }
 
 /*
- * Handle a new message.  Return 1 if the message should be requeued,
+ * Handle a received message.  Return 1 if the message should be requeued,
  * 0 if the message should be freed, or -1 if the message should not
  * be freed or requeued.
  */
-static int handle_new_recv_msg(ipmi_smi_t          intf,
+static int handle_one_recv_msg(ipmi_smi_t          intf,
                               struct ipmi_smi_msg *msg)
 {
        int requeue;
@@ -3783,12 +3800,72 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
        return requeue;
 }
 
+/*
+ * If there are messages in the queue or pretimeouts, handle them.
+ */
+static void handle_new_recv_msgs(ipmi_smi_t intf)
+{
+       struct ipmi_smi_msg  *smi_msg;
+       unsigned long        flags = 0;
+       int                  rv;
+       int                  run_to_completion = intf->run_to_completion;
+
+       /* See if any waiting messages need to be processed. */
+       if (!run_to_completion)
+               spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
+       while (!list_empty(&intf->waiting_msgs)) {
+               smi_msg = list_entry(intf->waiting_msgs.next,
+                                    struct ipmi_smi_msg, link);
+               list_del(&smi_msg->link);
+               if (!run_to_completion)
+                       spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
+               rv = handle_one_recv_msg(intf, smi_msg);
+               if (!run_to_completion)
+                       spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
+               if (rv == 0) {
+                       /* Message handled */
+                       ipmi_free_smi_msg(smi_msg);
+               } else if (rv < 0) {
+                       /* Fatal error on the message, del but don't free. */
+               } else {
+                       /*
+                        * To preserve message order, quit if we
+                        * can't handle a message.
+                        */
+                       list_add(&smi_msg->link, &intf->waiting_msgs);
+                       break;
+               }
+       }
+       if (!run_to_completion)
+               spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
+
+       /*
+        * If the pretimeout count is non-zero, decrement it by one and
+        * deliver pretimeouts to all the users.
+        */
+       if (atomic_add_unless(&intf->watchdog_pretimeouts_to_deliver, -1, 0)) {
+               ipmi_user_t user;
+
+               rcu_read_lock();
+               list_for_each_entry_rcu(user, &intf->users, link) {
+                       if (user->handler->ipmi_watchdog_pretimeout)
+                               user->handler->ipmi_watchdog_pretimeout(
+                                       user->handler_data);
+               }
+               rcu_read_unlock();
+       }
+}
+
+static void smi_recv_tasklet(unsigned long val)
+{
+       handle_new_recv_msgs((ipmi_smi_t) val);
+}
+
 /* Handle a new message from the lower layer. */
 void ipmi_smi_msg_received(ipmi_smi_t          intf,
                           struct ipmi_smi_msg *msg)
 {
        unsigned long flags = 0; /* keep us warning-free. */
-       int           rv;
        int           run_to_completion;
 
 
@@ -3842,31 +3919,11 @@ void ipmi_smi_msg_received(ipmi_smi_t          intf,
        run_to_completion = intf->run_to_completion;
        if (!run_to_completion)
                spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
-       if (!list_empty(&intf->waiting_msgs)) {
-               list_add_tail(&msg->link, &intf->waiting_msgs);
-               if (!run_to_completion)
-                       spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
-               goto out;
-       }
+       list_add_tail(&msg->link, &intf->waiting_msgs);
        if (!run_to_completion)
                spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
 
-       rv = handle_new_recv_msg(intf, msg);
-       if (rv > 0) {
-               /*
-                * Could not handle the message now, just add it to a
-                * list to handle later.
-                */
-               run_to_completion = intf->run_to_completion;
-               if (!run_to_completion)
-                       spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
-               list_add_tail(&msg->link, &intf->waiting_msgs);
-               if (!run_to_completion)
-                       spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
-       } else if (rv == 0) {
-               ipmi_free_smi_msg(msg);
-       }
-
+       tasklet_schedule(&intf->recv_tasklet);
  out:
        return;
 }
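
The structural change here: ipmi_smi_msg_received() now only queues the message and kicks a tasklet, so an SMI driver may call it from hard-irq context while delivery to users happens in softirq context. A self-contained sketch of that deferral pattern, with hypothetical demo_* names standing in for the driver's own types:

    #include <linux/interrupt.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct demo_intf {                      /* hypothetical stand-in type */
            spinlock_t lock;
            struct list_head waiting_msgs;
            struct tasklet_struct recv_tasklet;
    };

    static void demo_recv_tasklet(unsigned long val)
    {
            struct demo_intf *intf = (struct demo_intf *)val;
            /* drain intf->waiting_msgs here, as handle_new_recv_msgs() does */
    }

    static void demo_init(struct demo_intf *intf)
    {
            spin_lock_init(&intf->lock);
            INIT_LIST_HEAD(&intf->waiting_msgs);
            tasklet_init(&intf->recv_tasklet, demo_recv_tasklet,
                         (unsigned long)intf);
    }

    /* safe from hard-irq context: queue under the lock, kick the softirq */
    static void demo_msg_received(struct demo_intf *intf, struct list_head *msg)
    {
            unsigned long flags;

            spin_lock_irqsave(&intf->lock, flags);
            list_add_tail(msg, &intf->waiting_msgs);
            spin_unlock_irqrestore(&intf->lock, flags);
            tasklet_schedule(&intf->recv_tasklet);
    }
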
@@ -3874,16 +3931,8 @@ EXPORT_SYMBOL(ipmi_smi_msg_received);
 
 void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf)
 {
-       ipmi_user_t user;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(user, &intf->users, link) {
-               if (!user->handler->ipmi_watchdog_pretimeout)
-                       continue;
-
-               user->handler->ipmi_watchdog_pretimeout(user->handler_data);
-       }
-       rcu_read_unlock();
+       atomic_set(&intf->watchdog_pretimeouts_to_deliver, 1);
+       tasklet_schedule(&intf->recv_tasklet);
 }
 EXPORT_SYMBOL(ipmi_smi_watchdog_pretimeout);
 
@@ -3997,28 +4046,12 @@ static void ipmi_timeout_handler(long timeout_period)
        ipmi_smi_t           intf;
        struct list_head     timeouts;
        struct ipmi_recv_msg *msg, *msg2;
-       struct ipmi_smi_msg  *smi_msg, *smi_msg2;
        unsigned long        flags;
        int                  i;
 
        rcu_read_lock();
        list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
-               /* See if any waiting messages need to be processed. */
-               spin_lock_irqsave(&intf->waiting_msgs_lock, flags);
-               list_for_each_entry_safe(smi_msg, smi_msg2,
-                                        &intf->waiting_msgs, link) {
-                       if (!handle_new_recv_msg(intf, smi_msg)) {
-                               list_del(&smi_msg->link);
-                               ipmi_free_smi_msg(smi_msg);
-                       } else {
-                               /*
-                                * To preserve message order, quit if we
-                                * can't handle a message.
-                                */
-                               break;
-                       }
-               }
-               spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags);
+               tasklet_schedule(&intf->recv_tasklet);
 
                /*
                 * Go through the seq table and find any messages that
@@ -4172,12 +4205,48 @@ EXPORT_SYMBOL(ipmi_free_recv_msg);
 
 #ifdef CONFIG_IPMI_PANIC_EVENT
 
+static atomic_t panic_done_count = ATOMIC_INIT(0);
+
 static void dummy_smi_done_handler(struct ipmi_smi_msg *msg)
 {
+       atomic_dec(&panic_done_count);
 }
 
 static void dummy_recv_done_handler(struct ipmi_recv_msg *msg)
 {
+       atomic_dec(&panic_done_count);
+}
+
+/*
+ * Inside a panic, send a message and wait for a response.
+ */
+static void ipmi_panic_request_and_wait(ipmi_smi_t           intf,
+                                       struct ipmi_addr     *addr,
+                                       struct kernel_ipmi_msg *msg)
+{
+       struct ipmi_smi_msg  smi_msg;
+       struct ipmi_recv_msg recv_msg;
+       int rv;
+
+       smi_msg.done = dummy_smi_done_handler;
+       recv_msg.done = dummy_recv_done_handler;
+       atomic_add(2, &panic_done_count);
+       rv = i_ipmi_request(NULL,
+                           intf,
+                           addr,
+                           0,
+                           msg,
+                           intf,
+                           &smi_msg,
+                           &recv_msg,
+                           0,
+                           intf->channels[0].address,
+                           intf->channels[0].lun,
+                           0, 1); /* Don't retry, and don't wait. */
+       if (rv)
+               atomic_sub(2, &panic_done_count);
+       while (atomic_read(&panic_done_count) != 0)
+               ipmi_poll(intf);
 }
 
 #ifdef CONFIG_IPMI_PANIC_STRING
@@ -4216,8 +4285,6 @@ static void send_panic_events(char *str)
        unsigned char                     data[16];
        struct ipmi_system_interface_addr *si;
        struct ipmi_addr                  addr;
-       struct ipmi_smi_msg               smi_msg;
-       struct ipmi_recv_msg              recv_msg;
 
        si = (struct ipmi_system_interface_addr *) &addr;
        si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
@@ -4245,9 +4312,6 @@ static void send_panic_events(char *str)
                data[7] = str[2];
        }
 
-       smi_msg.done = dummy_smi_done_handler;
-       recv_msg.done = dummy_recv_done_handler;
-
        /* For every registered interface, send the event. */
        list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
                if (!intf->handlers)
@@ -4257,18 +4321,7 @@ static void send_panic_events(char *str)
                intf->run_to_completion = 1;
                /* Send the event announcing the panic. */
                intf->handlers->set_run_to_completion(intf->send_info, 1);
-               i_ipmi_request(NULL,
-                              intf,
-                              &addr,
-                              0,
-                              &msg,
-                              intf,
-                              &smi_msg,
-                              &recv_msg,
-                              0,
-                              intf->channels[0].address,
-                              intf->channels[0].lun,
-                              0, 1); /* Don't retry, and don't wait. */
+               ipmi_panic_request_and_wait(intf, &addr, &msg);
        }
 
 #ifdef CONFIG_IPMI_PANIC_STRING
@@ -4316,18 +4369,7 @@ static void send_panic_events(char *str)
                msg.data = NULL;
                msg.data_len = 0;
                intf->null_user_handler = device_id_fetcher;
-               i_ipmi_request(NULL,
-                              intf,
-                              &addr,
-                              0,
-                              &msg,
-                              intf,
-                              &smi_msg,
-                              &recv_msg,
-                              0,
-                              intf->channels[0].address,
-                              intf->channels[0].lun,
-                              0, 1); /* Don't retry, and don't wait. */
+               ipmi_panic_request_and_wait(intf, &addr, &msg);
 
                if (intf->local_event_generator) {
                        /* Request the event receiver from the local MC. */
@@ -4336,18 +4378,7 @@ static void send_panic_events(char *str)
                        msg.data = NULL;
                        msg.data_len = 0;
                        intf->null_user_handler = event_receiver_fetcher;
-                       i_ipmi_request(NULL,
-                                      intf,
-                                      &addr,
-                                      0,
-                                      &msg,
-                                      intf,
-                                      &smi_msg,
-                                      &recv_msg,
-                                      0,
-                                      intf->channels[0].address,
-                                      intf->channels[0].lun,
-                                      0, 1); /* no retry, and no wait. */
+                       ipmi_panic_request_and_wait(intf, &addr, &msg);
                }
                intf->null_user_handler = NULL;
 
@@ -4404,18 +4435,7 @@ static void send_panic_events(char *str)
                        strncpy(data+5, p, 11);
                        p += size;
 
-                       i_ipmi_request(NULL,
-                                      intf,
-                                      &addr,
-                                      0,
-                                      &msg,
-                                      intf,
-                                      &smi_msg,
-                                      &recv_msg,
-                                      0,
-                                      intf->channels[0].address,
-                                      intf->channels[0].lun,
-                                      0, 1); /* no retry, and no wait. */
+                       ipmi_panic_request_and_wait(intf, &addr, &msg);
                }
        }
 #endif /* CONFIG_IPMI_PANIC_STRING */
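
The panic path relies on a counting idiom rather than interrupts: bump panic_done_count by two (one SMI message, one receive message) before submitting, undo only if submission fails, then spin in ipmi_poll() until both dummy done handlers have decremented it. Condensed sketch of the idiom in isolation (submit_request() is a hypothetical stand-in for i_ipmi_request()):

    static void request_and_spin(ipmi_smi_t intf)
    {
            /* raise the count before submitting so a completion that
             * fires inside the submit call cannot underflow it */
            atomic_add(2, &panic_done_count);
            if (submit_request(intf))               /* stand-in for i_ipmi_request() */
                    atomic_sub(2, &panic_done_count);
            while (atomic_read(&panic_done_count) != 0)
                    ipmi_poll(intf);                /* interrupts are unusable here */
    }
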
index f9fdc114b31dfbdba423491afe58004178693767..1e638fff40ea2cb01d99d33745ae5f6526787237 100644 (file)
@@ -170,7 +170,6 @@ struct smi_info {
        struct si_sm_handlers  *handlers;
        enum si_type           si_type;
        spinlock_t             si_lock;
-       spinlock_t             msg_lock;
        struct list_head       xmit_msgs;
        struct list_head       hp_xmit_msgs;
        struct ipmi_smi_msg    *curr_msg;
@@ -319,16 +318,8 @@ static int register_xaction_notifier(struct notifier_block *nb)
 static void deliver_recv_msg(struct smi_info *smi_info,
                             struct ipmi_smi_msg *msg)
 {
-       /* Deliver the message to the upper layer with the lock
-          released. */
-
-       if (smi_info->run_to_completion) {
-               ipmi_smi_msg_received(smi_info->intf, msg);
-       } else {
-               spin_unlock(&(smi_info->si_lock));
-               ipmi_smi_msg_received(smi_info->intf, msg);
-               spin_lock(&(smi_info->si_lock));
-       }
+       /* Deliver the message to the upper layer. */
+       ipmi_smi_msg_received(smi_info->intf, msg);
 }
 
 static void return_hosed_msg(struct smi_info *smi_info, int cCode)
@@ -357,13 +348,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info)
        struct timeval t;
 #endif
 
-       /*
-        * No need to save flags, we aleady have interrupts off and we
-        * already hold the SMI lock.
-        */
-       if (!smi_info->run_to_completion)
-               spin_lock(&(smi_info->msg_lock));
-
        /* Pick the high priority queue first. */
        if (!list_empty(&(smi_info->hp_xmit_msgs))) {
                entry = smi_info->hp_xmit_msgs.next;
@@ -401,9 +385,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info)
                rv = SI_SM_CALL_WITHOUT_DELAY;
        }
  out:
-       if (!smi_info->run_to_completion)
-               spin_unlock(&(smi_info->msg_lock));
-
        return rv;
 }
 
@@ -480,9 +461,7 @@ static void handle_flags(struct smi_info *smi_info)
 
                start_clear_flags(smi_info);
                smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
-               spin_unlock(&(smi_info->si_lock));
                ipmi_smi_watchdog_pretimeout(smi_info->intf);
-               spin_lock(&(smi_info->si_lock));
        } else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) {
                /* Messages available. */
                smi_info->curr_msg = ipmi_alloc_smi_msg();
@@ -888,19 +867,6 @@ static void sender(void                *send_info,
        printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec);
 #endif
 
-       /*
-        * last_timeout_jiffies is updated here to avoid
-        * smi_timeout() handler passing very large time_diff
-        * value to smi_event_handler() that causes
-        * the send command to abort.
-        */
-       smi_info->last_timeout_jiffies = jiffies;
-
-       mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES);
-
-       if (smi_info->thread)
-               wake_up_process(smi_info->thread);
-
        if (smi_info->run_to_completion) {
                /*
                 * If we are running to completion, then throw it in
@@ -923,16 +889,29 @@ static void sender(void                *send_info,
                return;
        }
 
-       spin_lock_irqsave(&smi_info->msg_lock, flags);
+       spin_lock_irqsave(&smi_info->si_lock, flags);
        if (priority > 0)
                list_add_tail(&msg->link, &smi_info->hp_xmit_msgs);
        else
                list_add_tail(&msg->link, &smi_info->xmit_msgs);
-       spin_unlock_irqrestore(&smi_info->msg_lock, flags);
 
-       spin_lock_irqsave(&smi_info->si_lock, flags);
-       if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL)
+       if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) {
+               /*
+                * last_timeout_jiffies is updated here to avoid
+                * smi_timeout() handler passing very large time_diff
+                * value to smi_event_handler() that causes
+                * the send command to abort.
+                */
+               smi_info->last_timeout_jiffies = jiffies;
+
+               mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES);
+
+               if (smi_info->thread)
+                       wake_up_process(smi_info->thread);
+
                start_next_msg(smi_info);
+               smi_event_handler(smi_info, 0);
+       }
        spin_unlock_irqrestore(&smi_info->si_lock, flags);
 }
 
@@ -1033,16 +1012,19 @@ static int ipmi_thread(void *data)
 static void poll(void *send_info)
 {
        struct smi_info *smi_info = send_info;
-       unsigned long flags;
+       unsigned long flags = 0;
+       int run_to_completion = smi_info->run_to_completion;
 
        /*
         * Make sure there is some delay in the poll loop so we can
         * drive time forward and timeout things.
         */
        udelay(10);
-       spin_lock_irqsave(&smi_info->si_lock, flags);
+       if (!run_to_completion)
+               spin_lock_irqsave(&smi_info->si_lock, flags);
        smi_event_handler(smi_info, 10);
-       spin_unlock_irqrestore(&smi_info->si_lock, flags);
+       if (!run_to_completion)
+               spin_unlock_irqrestore(&smi_info->si_lock, flags);
 }
 
 static void request_events(void *send_info)
@@ -1679,10 +1661,8 @@ static struct smi_info *smi_info_alloc(void)
 {
        struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
 
-       if (info) {
+       if (info)
                spin_lock_init(&info->si_lock);
-               spin_lock_init(&info->msg_lock);
-       }
        return info;
 }
 
index 020a6aec2d86d70f15b9f27443f39af30f0bf914..7ed356e520351741173757d81270334e10ec4585 100644 (file)
@@ -520,6 +520,7 @@ static void panic_halt_ipmi_heartbeat(void)
        msg.cmd = IPMI_WDOG_RESET_TIMER;
        msg.data = NULL;
        msg.data_len = 0;
+       atomic_add(2, &panic_done_count);
        rv = ipmi_request_supply_msgs(watchdog_user,
                                      (struct ipmi_addr *) &addr,
                                      0,
@@ -528,8 +529,8 @@ static void panic_halt_ipmi_heartbeat(void)
                                      &panic_halt_heartbeat_smi_msg,
                                      &panic_halt_heartbeat_recv_msg,
                                      1);
-       if (!rv)
-               atomic_add(2, &panic_done_count);
+       if (rv)
+               atomic_sub(2, &panic_done_count);
 }
 
 static struct ipmi_smi_msg panic_halt_smi_msg = {
@@ -553,16 +554,18 @@ static void panic_halt_ipmi_set_timeout(void)
        /* Wait for the messages to be free. */
        while (atomic_read(&panic_done_count) != 0)
                ipmi_poll_interface(watchdog_user);
+       atomic_add(2, &panic_done_count);
        rv = i_ipmi_set_timeout(&panic_halt_smi_msg,
                                &panic_halt_recv_msg,
                                &send_heartbeat_now);
-       if (!rv) {
-               atomic_add(2, &panic_done_count);
-               if (send_heartbeat_now)
-                       panic_halt_ipmi_heartbeat();
-       } else
+       if (rv) {
+               atomic_sub(2, &panic_done_count);
                printk(KERN_WARNING PFX
                       "Unable to extend the watchdog timeout.");
+       } else {
+               if (send_heartbeat_now)
+                       panic_halt_ipmi_heartbeat();
+       }
        while (atomic_read(&panic_done_count) != 0)
                ipmi_poll_interface(watchdog_user);
 }
@@ -1164,7 +1167,7 @@ static int wdog_reboot_handler(struct notifier_block *this,
                if (code == SYS_POWER_OFF || code == SYS_HALT) {
                        /* Disable the WDT if we are shutting down. */
                        ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
-                       panic_halt_ipmi_set_timeout();
+                       ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
                } else if (ipmi_watchdog_state != WDOG_TIMEOUT_NONE) {
                        /* Set a long timer to let the reboot happens, but
                           reboot if it hangs, but only if the watchdog
@@ -1172,7 +1175,7 @@ static int wdog_reboot_handler(struct notifier_block *this,
                        timeout = 120;
                        pretimeout = 0;
                        ipmi_watchdog_state = WDOG_TIMEOUT_RESET;
-                       panic_halt_ipmi_set_timeout();
+                       ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
                }
        }
        return NOTIFY_OK;
index a2161f631a831f66992d4d84b9756c56b384de3f..2231aec23918fec6c1f65125421d1d35c229c759 100644 (file)
@@ -271,7 +271,7 @@ static int tosa_lcd_resume(struct spi_device *spi)
 }
 #else
 #define tosa_lcd_suspend       NULL
-#define tosa_lcd_reume NULL
+#define tosa_lcd_resume NULL
 #endif
 
 static struct spi_driver tosa_lcd_driver = {
index 70e2017edd70e68c9958d49f1692f3401013df92..36d66653b93191c9c13c21e74dea6f511a6ac9ab 100644 (file)
@@ -1384,10 +1384,23 @@ static void invalidate_bh_lru(void *arg)
        }
        put_cpu_var(bh_lrus);
 }
+
+static bool has_bh_in_lru(int cpu, void *dummy)
+{
+       struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
+       int i;
+
+       for (i = 0; i < BH_LRU_SIZE; i++) {
+               if (b->bhs[i])
+                       return 1;
+       }
+
+       return 0;
+}
+
 void invalidate_bh_lrus(void)
 {
-       on_each_cpu(invalidate_bh_lru, NULL, 1);
+       on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
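
invalidate_bh_lrus() now uses the conditional cross-call added earlier in this series: an IPI is sent only to CPUs whose predicate returns true, so CPUs with empty buffer-head LRUs are never interrupted. Its signature, inferred from the call above:

    /* declared in <linux/smp.h> by this series */
    void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
                          smp_call_func_t func, void *info,
                          bool wait, gfp_t gfp_flags);

    /* usage, as in invalidate_bh_lrus() above */
    on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
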
index fbb53c249086bf074ef273b3c69378e5f1746157..f9bd395b3473f6f1ca5f0697679d7d469e9fdb7c 100644 (file)
@@ -550,7 +550,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
        seq_put_decimal_ull(m, ' ', shared);
        seq_put_decimal_ull(m, ' ', text);
        seq_put_decimal_ull(m, ' ', 0);
-       seq_put_decimal_ull(m, ' ', text);
+       seq_put_decimal_ull(m, ' ', data);
        seq_put_decimal_ull(m, ' ', 0);
        seq_putc(m, '\n');
 
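
The statm fix: field six of /proc/<pid>/statm is the data+stack size, but the old code emitted text a second time. A userspace spot-check (field order per proc(5)):

    #include <stdio.h>

    int main(void)
    {
            unsigned long size, res, shared, text, lib, data, dt;
            FILE *f = fopen("/proc/self/statm", "r");

            if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
                             &size, &res, &shared, &text, &lib, &data, &dt) != 7)
                    return 1;
            fclose(f);
            printf("text=%lu data=%lu\n", text, data);  /* differ after the fix */
            return 0;
    }
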
index 3551f1f839eb163801a22bfb1243211ee41b404f..0d9e23a39e495f0d6dd0a69d0f84aaa05faef9b5 100644 (file)
@@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out;
 
-       last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
-       for (entry = ns_entries; entry <= last; entry++) {
+       last = &ns_entries[ARRAY_SIZE(ns_entries)];
+       for (entry = ns_entries; entry < last; entry++) {
                if (strlen((*entry)->name) != len)
                        continue;
                if (!memcmp(dentry->d_name.name, (*entry)->name, len))
                        break;
        }
        error = ERR_PTR(-ENOENT);
-       if (entry > last)
+       if (entry == last)
                goto out;
 
        error = proc_ns_instantiate(dir, dentry, task, *entry);
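
The fix replaces last-element/<= iteration with the past-the-end idiom: when ns_entries[] is empty, &ns_entries[ARRAY_SIZE - 1] computes element -1 (undefined behaviour) and the `entry > last` post-check misfires, whereas forming a one-past-the-end pointer is always legal. A userspace toy (the empty initializer is a GNU C extension, mirroring ns_entries[] with every namespace type configured out):

    #include <stdio.h>

    static const char *entries[] = { };
    #define N (sizeof(entries) / sizeof(entries[0]))

    int main(void)
    {
            const char **last = &entries[N];    /* one past the end: always legal */
            const char **entry;

            for (entry = entries; entry < last; entry++)
                    puts(*entry);               /* never runs when N == 0 */
            if (entry == last)
                    puts("not found");          /* matches the kernel's -ENOENT case */
            return 0;
    }
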
index 9694cc2835115f18c5c6f9719d8ab9a332631901..c283832d411d4ea79170210e68fb27f008aa49ea 100644 (file)
@@ -781,9 +781,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        int err = 0;
        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
 
-       if (pmd_trans_unstable(pmd))
-               return 0;
-
        /* find the first VMA at or above 'addr' */
        vma = find_vma(walk->mm, addr);
        spin_lock(&walk->mm->page_table_lock);
@@ -802,6 +799,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                return err;
        }
 
+       if (pmd_trans_unstable(pmd))
+               return 0;
        for (; addr != end; addr += PAGE_SIZE) {
 
                /* check to see if we've left 'vma' behind
index 7b9b75a529be04fef475c57b30bf351e19695e7b..1ffdb9856bb9fcc0592a939e98de7447e10266af 100644 (file)
@@ -810,11 +810,10 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 #else /* NR_CPUS > 1 */
 int __first_cpu(const cpumask_t *srcp);
 int __next_cpu(int n, const cpumask_t *srcp);
-int __any_online_cpu(const cpumask_t *mask);
 
 #define first_cpu(src)         __first_cpu(&(src))
 #define next_cpu(n, src)       __next_cpu((n), &(src))
-#define any_online_cpu(mask) __any_online_cpu(&(mask))
+#define any_online_cpu(mask) cpumask_any_and(&mask, cpu_online_mask)
 #define for_each_cpu_mask(cpu, mask)                   \
        for ((cpu) = -1;                                \
                (cpu) = next_cpu((cpu), (mask)),        \
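
any_online_cpu() keeps its contract but becomes a thin macro over cpumask_any_and(): some set bit in (mask & cpu_online_mask), or a value >= nr_cpu_ids when the intersection is empty, with no out-of-line __any_online_cpu() needed. A hedged sketch of a caller (pick_online() is hypothetical):

    #include <linux/cpumask.h>
    #include <linux/errno.h>

    /* 'candidates' is passed by value, per the old macro's convention */
    static int pick_online(cpumask_t candidates)
    {
            int cpu = any_online_cpu(candidates);

            return cpu < nr_cpu_ids ? cpu : -ENODEV;  /* empty intersection */
    }
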
index f2a60dde8c9e4430283c88d9b7645e00b84a9f81..d8738a464b94a71822f191bdd45bcac29b01814c 100644 (file)
@@ -954,7 +954,7 @@ extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 extern int vmtruncate(struct inode *inode, loff_t offset);
 extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
-
+void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
 int generic_error_remove_page(struct address_space *mapping, struct page *page);
 
index f5bd679be46b50a93903b768754b7a363507c767..b067bd8c49d0ef9ed7e277ba0bb8564afafb48e6 100644 (file)
@@ -33,6 +33,7 @@ struct pid_namespace {
 #endif
        gid_t pid_gid;
        int hide_pid;
+       int reboot;     /* group exit code if this pidns was rebooted */
 };
 
 extern struct pid_namespace init_pid_ns;
@@ -48,6 +49,7 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns);
 extern void free_pid_ns(struct kref *kref);
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
+extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
 
 static inline void put_pid_ns(struct pid_namespace *ns)
 {
@@ -75,11 +77,15 @@ static inline void put_pid_ns(struct pid_namespace *ns)
 {
 }
 
-
 static inline void zap_pid_ns_processes(struct pid_namespace *ns)
 {
        BUG();
 }
+
+static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
+{
+       return 0;
+}
 #endif /* CONFIG_PID_NS */
 
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
index e9a48234e69396650a7169ec0ba65e5f73e492ec..0d04cd69ab9b8c483895fbf7bab7cd691508e77a 100644 (file)
@@ -2,6 +2,7 @@
  * Copyright (C) 2001 Momchil Velikov
  * Portions Copyright (C) 2001 Christoph Hellwig
  * Copyright (C) 2006 Nick Piggin
+ * Copyright (C) 2012 Konstantin Khlebnikov
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -257,4 +258,199 @@ static inline void radix_tree_preload_end(void)
        preempt_enable();
 }
 
+/**
+ * struct radix_tree_iter - radix tree iterator state
+ *
+ * @index:     index of current slot
+ * @next_index:        one past the last index for this chunk
+ * @tags:      bit-mask for tag-iterating
+ *
+ * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
+ * subinterval of slots contained within one radix tree leaf node.  It is
+ * described by a pointer to its first slot and a struct radix_tree_iter
+ * which holds the chunk's position in the tree and its size.  For tagged
+ * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
+ * radix tree tag.
+ */
+struct radix_tree_iter {
+       unsigned long   index;
+       unsigned long   next_index;
+       unsigned long   tags;
+};
+
+#define RADIX_TREE_ITER_TAG_MASK       0x00FF  /* tag index in lower byte */
+#define RADIX_TREE_ITER_TAGGED         0x0100  /* lookup tagged slots */
+#define RADIX_TREE_ITER_CONTIG         0x0200  /* stop at first hole */
+
+/**
+ * radix_tree_iter_init - initialize radix tree iterator
+ *
+ * @iter:      pointer to iterator state
+ * @start:     iteration starting index
+ * Returns:    NULL
+ */
+static __always_inline void **
+radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
+{
+       /*
+        * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it
+        * in the case of a successful tagged chunk lookup.  If the lookup was
+        * unsuccessful or non-tagged then nobody cares about ->tags.
+        *
+        * Set index to zero to bypass next_index overflow protection.
+        * See the comment in radix_tree_next_chunk() for details.
+        */
+       iter->index = 0;
+       iter->next_index = start;
+       return NULL;
+}
+
+/**
+ * radix_tree_next_chunk - find next chunk of slots for iteration
+ *
+ * @root:      radix tree root
+ * @iter:      iterator state
+ * @flags:     RADIX_TREE_ITER_* flags and tag index
+ * Returns:    pointer to chunk first slot, or NULL if there are no more left
+ *
+ * This function looks up the next chunk in the radix tree starting from
+ * @iter->next_index.  It returns a pointer to the chunk's first slot.
+ * It also fills @iter with data about the chunk: its position in the tree
+ * (index), its end (next_index), and a bit-mask for tagged iteration (tags).
+ */
+void **radix_tree_next_chunk(struct radix_tree_root *root,
+                            struct radix_tree_iter *iter, unsigned flags);
+
+/**
+ * radix_tree_chunk_size - get current chunk size
+ *
+ * @iter:      pointer to radix tree iterator
+ * Returns:    current chunk size
+ */
+static __always_inline unsigned
+radix_tree_chunk_size(struct radix_tree_iter *iter)
+{
+       return iter->next_index - iter->index;
+}
+
+/**
+ * radix_tree_next_slot - find next slot in chunk
+ *
+ * @slot:      pointer to current slot
+ * @iter:      pointer to iterator state
+ * @flags:     RADIX_TREE_ITER_*, should be constant
+ * Returns:    pointer to next slot, or NULL if there are no more left
+ *
+ * This function updates @iter->index in the case of a successful lookup.
+ * For tagged lookup it also eats @iter->tags.
+ */
+static __always_inline void **
+radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
+{
+       if (flags & RADIX_TREE_ITER_TAGGED) {
+               iter->tags >>= 1;
+               if (likely(iter->tags & 1ul)) {
+                       iter->index++;
+                       return slot + 1;
+               }
+               if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) {
+                       unsigned offset = __ffs(iter->tags);
+
+                       iter->tags >>= offset;
+                       iter->index += offset + 1;
+                       return slot + offset + 1;
+               }
+       } else {
+               unsigned size = radix_tree_chunk_size(iter) - 1;
+
+               while (size--) {
+                       slot++;
+                       iter->index++;
+                       if (likely(*slot))
+                               return slot;
+                       if (flags & RADIX_TREE_ITER_CONTIG)
+                               break;
+               }
+       }
+       return NULL;
+}
+
+/**
+ * radix_tree_for_each_chunk - iterate over chunks
+ *
+ * @slot:      the void** variable for pointer to chunk first slot
+ * @root:      the struct radix_tree_root pointer
+ * @iter:      the struct radix_tree_iter pointer
+ * @start:     iteration starting index
+ * @flags:     RADIX_TREE_ITER_* and tag index
+ *
+ * Locks can be released and reacquired between iterations.
+ */
+#define radix_tree_for_each_chunk(slot, root, iter, start, flags)      \
+       for (slot = radix_tree_iter_init(iter, start) ;                 \
+             (slot = radix_tree_next_chunk(root, iter, flags)) ;)
+
+/**
+ * radix_tree_for_each_chunk_slot - iterate over slots in one chunk
+ *
+ * @slot:      the void** variable, at the beginning points to chunk first slot
+ * @iter:      the struct radix_tree_iter pointer
+ * @flags:     RADIX_TREE_ITER_*, should be constant
+ *
+ * This macro is designed to be nested inside radix_tree_for_each_chunk().
+ * @slot points to the radix tree slot, @iter->index contains its index.
+ */
+#define radix_tree_for_each_chunk_slot(slot, iter, flags)              \
+       for (; slot ; slot = radix_tree_next_slot(slot, iter, flags))
+
+/**
+ * radix_tree_for_each_slot - iterate over non-empty slots
+ *
+ * @slot:      the void** variable for pointer to slot
+ * @root:      the struct radix_tree_root pointer
+ * @iter:      the struct radix_tree_iter pointer
+ * @start:     iteration starting index
+ *
+ * @slot points to radix tree slot, @iter->index contains its index.
+ */
+#define radix_tree_for_each_slot(slot, root, iter, start)              \
+       for (slot = radix_tree_iter_init(iter, start) ;                 \
+            slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;    \
+            slot = radix_tree_next_slot(slot, iter, 0))
+
+/**
+ * radix_tree_for_each_contig - iterate over contiguous slots
+ *
+ * @slot:      the void** variable for pointer to slot
+ * @root:      the struct radix_tree_root pointer
+ * @iter:      the struct radix_tree_iter pointer
+ * @start:     iteration starting index
+ *
+ * @slot points to radix tree slot, @iter->index contains its index.
+ */
+#define radix_tree_for_each_contig(slot, root, iter, start)            \
+       for (slot = radix_tree_iter_init(iter, start) ;                 \
+            slot || (slot = radix_tree_next_chunk(root, iter,          \
+                               RADIX_TREE_ITER_CONTIG)) ;              \
+            slot = radix_tree_next_slot(slot, iter,                    \
+                               RADIX_TREE_ITER_CONTIG))
+
+/**
+ * radix_tree_for_each_tagged - iterate over tagged slots
+ *
+ * @slot:      the void** variable for pointer to slot
+ * @root:      the struct radix_tree_root pointer
+ * @iter:      the struct radix_tree_iter pointer
+ * @start:     iteration starting index
+ * @tag:       tag index
+ *
+ * @slot points to radix tree slot, @iter->index contains its index.
+ */
+#define radix_tree_for_each_tagged(slot, root, iter, start, tag)       \
+       for (slot = radix_tree_iter_init(iter, start) ;                 \
+            slot || (slot = radix_tree_next_chunk(root, iter,          \
+                             RADIX_TREE_ITER_TAGGED | tag)) ;          \
+            slot = radix_tree_next_slot(slot, iter,                    \
+                               RADIX_TREE_ITER_TAGGED))
+
 #endif /* _LINUX_RADIX_TREE_H */
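Taken together, these macros replace the gang-lookup calling convention
with an open-coded walk.  A minimal sketch of their use, assuming the
caller holds the lock protecting the tree (a lockless RCU walk would also
need the deref-retry handling shown in the mm/filemap.c hunks below);
my_tree and MY_TAG are hypothetical:

    RADIX_TREE(my_tree, GFP_KERNEL);        /* hypothetical tree */

    static void dump_tree(void)
    {
            struct radix_tree_iter iter;
            void **slot;

            /* Visit every non-empty slot, in index order. */
            radix_tree_for_each_slot(slot, &my_tree, &iter, 0)
                    pr_info("index %lu -> %p\n", iter.index, *slot);

            /* Or visit only the slots carrying a given tag. */
            radix_tree_for_each_tagged(slot, &my_tree, &iter, 0, MY_TAG)
                    pr_info("tagged index %lu\n", iter.index);
    }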
index 8cc38d3bab0c57a79f68fe53e4915f9393335cba..10530d92c04b05bd56ab269974094ecfd2efbaab 100644 (file)
@@ -101,6 +101,22 @@ static inline void call_function_init(void) { }
  */
 int on_each_cpu(smp_call_func_t func, void *info, int wait);
 
+/*
+ * Call a function on processors specified by mask, which might include
+ * the local one.
+ */
+void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
+               void *info, bool wait);
+
+/*
+ * Call a function on each processor for which the supplied function
+ * cond_func returns a positive value. This may include the local
+ * processor.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+               smp_call_func_t func, void *info, bool wait,
+               gfp_t gfp_flags);
+
 /*
  * Mark the boot cpu "online" so that it can call console drivers in
  * printk() and can access its per-cpu storage.
@@ -132,6 +148,36 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
                local_irq_enable();             \
                0;                              \
        })
+/*
+ * Note we still need to test the mask even on UP,
+ * because code written for SMP may legitimately pass
+ * a mask that does not include the local CPU, which
+ * on UP means an empty mask.
+ */
+#define on_each_cpu_mask(mask, func, info, wait) \
+       do {                                            \
+               if (cpumask_test_cpu(0, (mask))) {      \
+                       local_irq_disable();            \
+                       (func)(info);                   \
+                       local_irq_enable();             \
+               }                                       \
+       } while (0)
+/*
+ * Preemption is disabled here to make sure the cond_func is called under the
+ * same conditions in UP and SMP.
+ */
+#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\
+       do {                                                    \
+               void *__info = (info);                          \
+               preempt_disable();                              \
+               if ((cond_func)(0, __info)) {                   \
+                       local_irq_disable();                    \
+                       (func)(__info);                         \
+                       local_irq_enable();                     \
+               }                                               \
+               preempt_enable();                               \
+       } while (0)
+
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()                     1
 #define smp_prepare_boot_cpu()                 do {} while (0)
index b86b5c20617d93e361d462e20c0c3c87856214b6..8dc0ea7caf02baa404e8896d13351577a177547a 100644 (file)
@@ -21,6 +21,9 @@ struct bio;
 #define SWAP_FLAG_PRIO_SHIFT   0
 #define SWAP_FLAG_DISCARD      0x10000 /* discard swap cluster after use */
 
+#define SWAP_FLAGS_VALID       (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
+                                SWAP_FLAG_DISCARD)
+
 static inline int current_is_kswapd(void)
 {
        return current->flags & PF_KSWAPD;
index 2a0deffa5dbec2930c610bfec7ea4b7fdc258b34..4e2e472f6aeb35e5fb78e9a6ccdcc4c08fae57e3 100644 (file)
@@ -1358,6 +1358,10 @@ static int __init parse_crashkernel_simple(char          *cmdline,
 
        if (*cur == '@')
                *crash_base = memparse(cur+1, &cur);
+       else if (*cur != ' ' && *cur != '\0') {
+               pr_warning("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
 
        return 0;
 }
@@ -1461,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
 
        VMCOREINFO_SYMBOL(init_uts_ns);
        VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
        VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
        VMCOREINFO_SYMBOL(_stext);
        VMCOREINFO_SYMBOL(vmlist);
 
index 17b232869a04efce1ac4d43f9c30a66869c470ee..57bc1fd35b3cbe6bffdbfe71af5f13fc00648b81 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/acct.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
+#include <linux/reboot.h>
 
 #define BITS_PER_PAGE          (PAGE_SIZE*8)
 
@@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
                rc = sys_wait4(-1, NULL, __WALL, NULL);
        } while (rc != -ECHILD);
 
+       if (pid_ns->reboot)
+               current->signal->group_exit_code = pid_ns->reboot;
+
        acct_exit_ns(pid_ns);
        return;
 }
@@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
 
 static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
 
+int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
+{
+       if (pid_ns == &init_pid_ns)
+               return 0;
+
+       switch (cmd) {
+       case LINUX_REBOOT_CMD_RESTART2:
+       case LINUX_REBOOT_CMD_RESTART:
+               pid_ns->reboot = SIGHUP;
+               break;
+
+       case LINUX_REBOOT_CMD_POWER_OFF:
+       case LINUX_REBOOT_CMD_HALT:
+               pid_ns->reboot = SIGINT;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       read_lock(&tasklist_lock);
+       force_sig(SIGKILL, pid_ns->child_reaper);
+       read_unlock(&tasklist_lock);
+
+       do_exit(0);
+
+       /* Not reached */
+       return 0;
+}
+
 static __init int pid_namespaces_init(void)
 {
        pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
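From the parent namespace, a reboot issued inside a child pid namespace now
just looks like the namespace's init dying from SIGHUP (restart) or SIGINT
(halt/power-off).  A userspace sketch of how this could be observed; it
assumes CONFIG_PID_NS and sufficient privilege, and is not part of the
patch:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/reboot.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static char stack[64 * 1024];

    static int ns_init(void *arg)
    {
            /* Inside a child pid namespace this no longer reboots the
             * machine: reboot_pid_ns() kills this "init" task instead. */
            reboot(RB_AUTOBOOT);
            return 0;
    }

    int main(void)
    {
            int status;
            pid_t pid = clone(ns_init, stack + sizeof(stack),
                              CLONE_NEWPID | SIGCHLD, NULL);

            if (pid < 0)
                    return 1;
            waitpid(pid, &status, 0);
            if (WIFSIGNALED(status) && WTERMSIG(status) == SIGHUP)
                    printf("child namespace requested a restart\n");
            return 0;
    }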
index db197d60489b7c144f2f44fbea00e15944e31871..2f8b10ecf75996c1bf40feb78bf2a0c1d5471489 100644 (file)
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
        return ret;
 }
 EXPORT_SYMBOL(on_each_cpu);
+
+/**
+ * on_each_cpu_mask(): Run a function on processors specified by
+ * cpumask, which may include the local processor.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
+                       void *info, bool wait)
+{
+       int cpu = get_cpu();
+
+       smp_call_function_many(mask, func, info, wait);
+       if (cpumask_test_cpu(cpu, mask)) {
+               local_irq_disable();
+               func(info);
+               local_irq_enable();
+       }
+       put_cpu();
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * on_each_cpu_cond(): Call a function on each processor for which
+ * the supplied function cond_func returns true, optionally waiting
+ * for all the required CPUs to finish. This may include the local
+ * processor.
+ * @cond_func: A callback function that is passed a cpu id and
+ *             the info parameter. The function is called
+ *             with preemption disabled. The function should
+ *             return a boolean value indicating whether to IPI
+ *             the specified CPU.
+ * @func:      The function to run on all applicable CPUs.
+ *             This must be fast and non-blocking.
+ * @info:      An arbitrary pointer to pass to both functions.
+ * @wait:      If true, wait (atomically) until function has
+ *             completed on other CPUs.
+ * @gfp_flags: GFP flags to use when allocating the cpumask
+ *             used internally by the function.
+ *
+ * The function might sleep if the GFP flags indicate that a
+ * non-atomic allocation is allowed.
+ *
+ * Preemption is disabled to protect against CPUs going offline but not online.
+ * CPUs going online during the call will not be seen or sent an IPI.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+                       smp_call_func_t func, void *info, bool wait,
+                       gfp_t gfp_flags)
+{
+       cpumask_var_t cpus;
+       int cpu, ret;
+
+       might_sleep_if(gfp_flags & __GFP_WAIT);
+
+       if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
+               preempt_disable();
+               for_each_online_cpu(cpu)
+                       if (cond_func(cpu, info))
+                               cpumask_set_cpu(cpu, cpus);
+               on_each_cpu_mask(cpus, func, info, wait);
+               preempt_enable();
+               free_cpumask_var(cpus);
+       } else {
+               /*
+                * No free cpumask, bother. No matter, we'll
+                * just have to IPI them one by one.
+                */
+               preempt_disable();
+               for_each_online_cpu(cpu)
+                       if (cond_func(cpu, info)) {
+                               ret = smp_call_function_single(cpu, func,
+                                                               info, wait);
+                               WARN_ON_ONCE(ret);
+                       }
+               preempt_enable();
+       }
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
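on_each_cpu_mask() generalizes on_each_cpu() from "every online CPU" to an
arbitrary cpumask, with the local CPU handled in either case.  A hedged
sketch of a caller; the per-node use and both helper names are made up for
illustration:

    static void bump_counter(void *info)
    {
            atomic_inc(info);       /* runs on each CPU in the mask, IRQs off */
    }

    static void count_node_cpus(int node)
    {
            atomic_t hits = ATOMIC_INIT(0);

            /* The mask may include the local CPU; that case is handled. */
            on_each_cpu_mask(cpumask_of_node(node), bump_counter, &hits, true);
            pr_info("node %d: %d CPUs responded\n", node, atomic_read(&hits));
    }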
index 9eb7fcab8df622cdc1ad7fa032963324b777e982..e7006eb6c1e4fdc3fa7967eaa45541609b5d499f 100644 (file)
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                        magic2 != LINUX_REBOOT_MAGIC2C))
                return -EINVAL;
 
+       /*
+        * If pid namespaces are enabled and the current task is in a child
+        * pid_namespace, the command is handled by reboot_pid_ns() which will
+        * call do_exit().
+        */
+       ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
+       if (ret)
+               return ret;
+
        /* Instead of trying to make the power_off code look like
         * halt when pm_power_off is not set do it the easy way.
         */
index 803a374f67662d82d6553d86f38bff815c5388b4..52b3a06a02f8406c4f78eb06cfaad89f51910a21 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
+#include <linux/bitmap.h>
 #include <linux/signal.h>
 #include <linux/printk.h>
 #include <linux/proc_fs.h>
@@ -2395,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
                                }
                        }
 
-                       while (val_a <= val_b)
-                               set_bit(val_a++, tmp_bitmap);
-
+                       bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
                        first = 0;
                        proc_skip_char(&kbuf, &left, '\n');
                }
@@ -2440,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
                        if (*ppos)
                                bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
                        else
-                               memcpy(bitmap, tmp_bitmap,
-                                       BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+                               bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
                }
                kfree(tmp_bitmap);
                *lenp -= left;
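Both conversions are behavior-preserving uses of the lib/bitmap helpers:
bitmap_set(map, start, nbits) sets an inclusive run when passed
val_b - val_a + 1 bits, and bitmap_copy() copies whole words exactly as the
old memcpy() did.  A small sketch of the semantics, with illustrative
sizes:

    DECLARE_BITMAP(tmp, 128);
    DECLARE_BITMAP(dst, 128);

    bitmap_zero(tmp, 128);
    bitmap_set(tmp, 10, 5);         /* sets bits 10..14 inclusive */
    bitmap_copy(dst, tmp, 128);     /* same as the old memcpy of
                                       BITS_TO_LONGS(128) longs */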
index a0e5900a9d85eedba0da1420836797476753f8a0..4a8aba2e5cc016b84776a85af19d8d2d1b404f30 100644 (file)
@@ -88,6 +88,10 @@ choice
        prompt "CRC32 implementation"
        depends on CRC32
        default CRC32_SLICEBY8
+       help
+         This option allows a kernel builder to override the default choice
+         of CRC32 algorithm.  Choose the default ("slice by 8") unless you
+         know that you need one of the others.
 
 config CRC32_SLICEBY8
        bool "Slice by 8 bytes"
index 0b660118ed91d1323140b5cf1a503ee44ecb3aa6..402a54ac35cbf60f665485ddd18500d5034e9ee9 100644 (file)
@@ -26,18 +26,6 @@ int __next_cpu_nr(int n, const cpumask_t *srcp)
 EXPORT_SYMBOL(__next_cpu_nr);
 #endif
 
-int __any_online_cpu(const cpumask_t *mask)
-{
-       int cpu;
-
-       for_each_cpu(cpu, mask) {
-               if (cpu_online(cpu))
-                       break;
-       }
-       return cpu;
-}
-EXPORT_SYMBOL(__any_online_cpu);
-
 /**
  * cpumask_next_and - get the next cpu in *src1p & *src2p
  * @n: the cpu prior to the place to search (ie. return will be > @n)
index 3e69c2b66c9454cd074a9649ff946598467ba407..86516f5588e31782676087fd49fe45fd87538c03 100644 (file)
@@ -3,6 +3,7 @@
  * Portions Copyright (C) 2001 Christoph Hellwig
  * Copyright (C) 2005 SGI, Christoph Lameter
  * Copyright (C) 2006 Nick Piggin
+ * Copyright (C) 2012 Konstantin Khlebnikov
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -146,6 +147,43 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
        }
        return 0;
 }
+
+/**
+ * radix_tree_find_next_bit - find the next set bit in a memory region
+ *
+ * @addr: The address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Unrollable variant of find_next_bit() for constant size arrays.
+ * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
+ * Returns next bit offset, or size if nothing found.
+ */
+static __always_inline unsigned long
+radix_tree_find_next_bit(const unsigned long *addr,
+                        unsigned long size, unsigned long offset)
+{
+       if (!__builtin_constant_p(size))
+               return find_next_bit(addr, size, offset);
+
+       if (offset < size) {
+               unsigned long tmp;
+
+               addr += offset / BITS_PER_LONG;
+               tmp = *addr >> (offset % BITS_PER_LONG);
+               if (tmp)
+                       return __ffs(tmp) + offset;
+               offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
+               while (offset < size) {
+                       tmp = *++addr;
+                       if (tmp)
+                               return __ffs(tmp) + offset;
+                       offset += BITS_PER_LONG;
+               }
+       }
+       return size;
+}
+
 /*
  * This assumes that the caller has performed appropriate preallocation, and
  * that the caller has pinned this thread of control to the current CPU.
@@ -612,6 +650,119 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_get);
 
+/**
+ * radix_tree_next_chunk - find next chunk of slots for iteration
+ *
+ * @root:      radix tree root
+ * @iter:      iterator state
+ * @flags:     RADIX_TREE_ITER_* flags and tag index
+ * Returns:    pointer to chunk first slot, or NULL if iteration is over
+ */
+void **radix_tree_next_chunk(struct radix_tree_root *root,
+                            struct radix_tree_iter *iter, unsigned flags)
+{
+       unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
+       struct radix_tree_node *rnode, *node;
+       unsigned long index, offset;
+
+       if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
+               return NULL;
+
+       /*
+        * Catch next_index overflow after ~0UL. iter->index never overflows
+        * during iterating; it can be zero only at the beginning.
+        * And we cannot overflow iter->next_index in a single step,
+        * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
+        */
+       index = iter->next_index;
+       if (!index && iter->index)
+               return NULL;
+
+       rnode = rcu_dereference_raw(root->rnode);
+       if (radix_tree_is_indirect_ptr(rnode)) {
+               rnode = indirect_to_ptr(rnode);
+       } else if (rnode && !index) {
+               /* Single-slot tree */
+               iter->index = 0;
+               iter->next_index = 1;
+               iter->tags = 1;
+               return (void **)&root->rnode;
+       } else
+               return NULL;
+
+restart:
+       shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
+       offset = index >> shift;
+
+       /* Index outside of the tree */
+       if (offset >= RADIX_TREE_MAP_SIZE)
+               return NULL;
+
+       node = rnode;
+       while (1) {
+               if ((flags & RADIX_TREE_ITER_TAGGED) ?
+                               !test_bit(offset, node->tags[tag]) :
+                               !node->slots[offset]) {
+                       /* Hole detected */
+                       if (flags & RADIX_TREE_ITER_CONTIG)
+                               return NULL;
+
+                       if (flags & RADIX_TREE_ITER_TAGGED)
+                               offset = radix_tree_find_next_bit(
+                                               node->tags[tag],
+                                               RADIX_TREE_MAP_SIZE,
+                                               offset + 1);
+                       else
+                               while (++offset < RADIX_TREE_MAP_SIZE) {
+                                       if (node->slots[offset])
+                                               break;
+                               }
+                       index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1);
+                       index += offset << shift;
+                       /* Overflow after ~0UL */
+                       if (!index)
+                               return NULL;
+                       if (offset == RADIX_TREE_MAP_SIZE)
+                               goto restart;
+               }
+
+               /* This is leaf-node */
+               if (!shift)
+                       break;
+
+               node = rcu_dereference_raw(node->slots[offset]);
+               if (node == NULL)
+                       goto restart;
+               shift -= RADIX_TREE_MAP_SHIFT;
+               offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+       }
+
+       /* Update the iterator state */
+       iter->index = index;
+       iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1;
+
+       /* Construct iter->tags bit-mask from node->tags[tag] array */
+       if (flags & RADIX_TREE_ITER_TAGGED) {
+               unsigned tag_long, tag_bit;
+
+               tag_long = offset / BITS_PER_LONG;
+               tag_bit  = offset % BITS_PER_LONG;
+               iter->tags = node->tags[tag][tag_long] >> tag_bit;
+               /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
+               if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
+                       /* Pick tags from next element */
+                       if (tag_bit)
+                               iter->tags |= node->tags[tag][tag_long + 1] <<
+                                               (BITS_PER_LONG - tag_bit);
+                       /* Clip chunk size, here only BITS_PER_LONG tags */
+                       iter->next_index = index + BITS_PER_LONG;
+               }
+       }
+
+       return node->slots + offset;
+}
+EXPORT_SYMBOL(radix_tree_next_chunk);
+
 /**
  * radix_tree_range_tag_if_tagged - for each item in given range set given
  *                                tag if item has another tag set
@@ -817,57 +968,6 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_prev_hole);
 
-static unsigned int
-__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices,
-       unsigned long index, unsigned int max_items, unsigned long *next_index)
-{
-       unsigned int nr_found = 0;
-       unsigned int shift, height;
-       unsigned long i;
-
-       height = slot->height;
-       if (height == 0)
-               goto out;
-       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-       for ( ; height > 1; height--) {
-               i = (index >> shift) & RADIX_TREE_MAP_MASK;
-               for (;;) {
-                       if (slot->slots[i] != NULL)
-                               break;
-                       index &= ~((1UL << shift) - 1);
-                       index += 1UL << shift;
-                       if (index == 0)
-                               goto out;       /* 32-bit wraparound */
-                       i++;
-                       if (i == RADIX_TREE_MAP_SIZE)
-                               goto out;
-               }
-
-               shift -= RADIX_TREE_MAP_SHIFT;
-               slot = rcu_dereference_raw(slot->slots[i]);
-               if (slot == NULL)
-                       goto out;
-       }
-
-       /* Bottom level: grab some items */
-       for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-               if (slot->slots[i]) {
-                       results[nr_found] = &(slot->slots[i]);
-                       if (indices)
-                               indices[nr_found] = index;
-                       if (++nr_found == max_items) {
-                               index++;
-                               goto out;
-                       }
-               }
-               index++;
-       }
-out:
-       *next_index = index;
-       return nr_found;
-}
-
 /**
  *     radix_tree_gang_lookup - perform multiple lookup on a radix tree
  *     @root:          radix tree root
@@ -891,48 +991,19 @@ unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
 {
-       unsigned long max_index;
-       struct radix_tree_node *node;
-       unsigned long cur_index = first_index;
-       unsigned int ret;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned int ret = 0;
 
-       node = rcu_dereference_raw(root->rnode);
-       if (!node)
+       if (unlikely(!max_items))
                return 0;
 
-       if (!radix_tree_is_indirect_ptr(node)) {
-               if (first_index > 0)
-                       return 0;
-               results[0] = node;
-               return 1;
-       }
-       node = indirect_to_ptr(node);
-
-       max_index = radix_tree_maxindex(node->height);
-
-       ret = 0;
-       while (ret < max_items) {
-               unsigned int nr_found, slots_found, i;
-               unsigned long next_index;       /* Index of next search */
-
-               if (cur_index > max_index)
-                       break;
-               slots_found = __lookup(node, (void ***)results + ret, NULL,
-                               cur_index, max_items - ret, &next_index);
-               nr_found = 0;
-               for (i = 0; i < slots_found; i++) {
-                       struct radix_tree_node *slot;
-                       slot = *(((void ***)results)[ret + i]);
-                       if (!slot)
-                               continue;
-                       results[ret + nr_found] =
-                               indirect_to_ptr(rcu_dereference_raw(slot));
-                       nr_found++;
-               }
-               ret += nr_found;
-               if (next_index == 0)
+       radix_tree_for_each_slot(slot, root, &iter, first_index) {
+               results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+               if (!results[ret])
+                       continue;
+               if (++ret == max_items)
                        break;
-               cur_index = next_index;
        }
 
        return ret;
@@ -962,112 +1033,25 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root,
                        void ***results, unsigned long *indices,
                        unsigned long first_index, unsigned int max_items)
 {
-       unsigned long max_index;
-       struct radix_tree_node *node;
-       unsigned long cur_index = first_index;
-       unsigned int ret;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned int ret = 0;
 
-       node = rcu_dereference_raw(root->rnode);
-       if (!node)
+       if (unlikely(!max_items))
                return 0;
 
-       if (!radix_tree_is_indirect_ptr(node)) {
-               if (first_index > 0)
-                       return 0;
-               results[0] = (void **)&root->rnode;
+       radix_tree_for_each_slot(slot, root, &iter, first_index) {
+               results[ret] = slot;
                if (indices)
-                       indices[0] = 0;
-               return 1;
-       }
-       node = indirect_to_ptr(node);
-
-       max_index = radix_tree_maxindex(node->height);
-
-       ret = 0;
-       while (ret < max_items) {
-               unsigned int slots_found;
-               unsigned long next_index;       /* Index of next search */
-
-               if (cur_index > max_index)
+                       indices[ret] = iter.index;
+               if (++ret == max_items)
                        break;
-               slots_found = __lookup(node, results + ret,
-                               indices ? indices + ret : NULL,
-                               cur_index, max_items - ret, &next_index);
-               ret += slots_found;
-               if (next_index == 0)
-                       break;
-               cur_index = next_index;
        }
 
        return ret;
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
 
-/*
- * FIXME: the two tag_get()s here should use find_next_bit() instead of
- * open-coding the search.
- */
-static unsigned int
-__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
-       unsigned int max_items, unsigned long *next_index, unsigned int tag)
-{
-       unsigned int nr_found = 0;
-       unsigned int shift, height;
-
-       height = slot->height;
-       if (height == 0)
-               goto out;
-       shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-       while (height > 0) {
-               unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
-
-               for (;;) {
-                       if (tag_get(slot, tag, i))
-                               break;
-                       index &= ~((1UL << shift) - 1);
-                       index += 1UL << shift;
-                       if (index == 0)
-                               goto out;       /* 32-bit wraparound */
-                       i++;
-                       if (i == RADIX_TREE_MAP_SIZE)
-                               goto out;
-               }
-               height--;
-               if (height == 0) {      /* Bottom level: grab some items */
-                       unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-                       for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-                               index++;
-                               if (!tag_get(slot, tag, j))
-                                       continue;
-                               /*
-                                * Even though the tag was found set, we need to
-                                * recheck that we have a non-NULL node, because
-                                * if this lookup is lockless, it may have been
-                                * subsequently deleted.
-                                *
-                                * Similar care must be taken in any place that
-                                * lookup ->slots[x] without a lock (ie. can't
-                                * rely on its value remaining the same).
-                                */
-                               if (slot->slots[j]) {
-                                       results[nr_found++] = &(slot->slots[j]);
-                                       if (nr_found == max_items)
-                                               goto out;
-                               }
-                       }
-               }
-               shift -= RADIX_TREE_MAP_SHIFT;
-               slot = rcu_dereference_raw(slot->slots[i]);
-               if (slot == NULL)
-                       break;
-       }
-out:
-       *next_index = index;
-       return nr_found;
-}
-
 /**
  *     radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
  *                                  based on a tag
@@ -1086,52 +1070,19 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
 {
-       struct radix_tree_node *node;
-       unsigned long max_index;
-       unsigned long cur_index = first_index;
-       unsigned int ret;
-
-       /* check the root's tag bit */
-       if (!root_tag_get(root, tag))
-               return 0;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned int ret = 0;
 
-       node = rcu_dereference_raw(root->rnode);
-       if (!node)
+       if (unlikely(!max_items))
                return 0;
 
-       if (!radix_tree_is_indirect_ptr(node)) {
-               if (first_index > 0)
-                       return 0;
-               results[0] = node;
-               return 1;
-       }
-       node = indirect_to_ptr(node);
-
-       max_index = radix_tree_maxindex(node->height);
-
-       ret = 0;
-       while (ret < max_items) {
-               unsigned int nr_found, slots_found, i;
-               unsigned long next_index;       /* Index of next search */
-
-               if (cur_index > max_index)
-                       break;
-               slots_found = __lookup_tag(node, (void ***)results + ret,
-                               cur_index, max_items - ret, &next_index, tag);
-               nr_found = 0;
-               for (i = 0; i < slots_found; i++) {
-                       struct radix_tree_node *slot;
-                       slot = *(((void ***)results)[ret + i]);
-                       if (!slot)
-                               continue;
-                       results[ret + nr_found] =
-                               indirect_to_ptr(rcu_dereference_raw(slot));
-                       nr_found++;
-               }
-               ret += nr_found;
-               if (next_index == 0)
+       radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
+               results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+               if (!results[ret])
+                       continue;
+               if (++ret == max_items)
                        break;
-               cur_index = next_index;
        }
 
        return ret;
@@ -1156,42 +1107,17 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
 {
-       struct radix_tree_node *node;
-       unsigned long max_index;
-       unsigned long cur_index = first_index;
-       unsigned int ret;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned int ret = 0;
 
-       /* check the root's tag bit */
-       if (!root_tag_get(root, tag))
-               return 0;
-
-       node = rcu_dereference_raw(root->rnode);
-       if (!node)
+       if (unlikely(!max_items))
                return 0;
 
-       if (!radix_tree_is_indirect_ptr(node)) {
-               if (first_index > 0)
-                       return 0;
-               results[0] = (void **)&root->rnode;
-               return 1;
-       }
-       node = indirect_to_ptr(node);
-
-       max_index = radix_tree_maxindex(node->height);
-
-       ret = 0;
-       while (ret < max_items) {
-               unsigned int slots_found;
-               unsigned long next_index;       /* Index of next search */
-
-               if (cur_index > max_index)
-                       break;
-               slots_found = __lookup_tag(node, results + ret,
-                               cur_index, max_items - ret, &next_index, tag);
-               ret += slots_found;
-               if (next_index == 0)
+       radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
+               results[ret] = slot;
+               if (++ret == max_items)
                        break;
-               cur_index = next_index;
        }
 
        return ret;
index c3811bc6b9e33ae88d2575d0387c810b53b95fb8..79c4b2b0b14eec1d05c93e3493dd02e0fd182829 100644 (file)
@@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
                            unsigned int nr_pages, struct page **pages)
 {
-       unsigned int i;
-       unsigned int ret;
-       unsigned int nr_found, nr_skip;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned ret = 0;
+
+       if (unlikely(!nr_pages))
+               return 0;
 
        rcu_read_lock();
 restart:
-       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, NULL, start, nr_pages);
-       ret = 0;
-       nr_skip = 0;
-       for (i = 0; i < nr_found; i++) {
+       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                struct page *page;
 repeat:
-               page = radix_tree_deref_slot((void **)pages[i]);
+               page = radix_tree_deref_slot(slot);
                if (unlikely(!page))
                        continue;
 
@@ -837,7 +836,7 @@ repeat:
                                 * when entry at index 0 moves out of or back
                                 * to root: none yet gotten, safe to restart.
                                 */
-                               WARN_ON(start | i);
+                               WARN_ON(iter.index);
                                goto restart;
                        }
                        /*
@@ -845,7 +844,6 @@ repeat:
                         * here as an exceptional entry: so skip over it -
                         * we only reach this from invalidate_mapping_pages().
                         */
-                       nr_skip++;
                        continue;
                }
 
@@ -853,21 +851,16 @@ repeat:
                        goto repeat;
 
                /* Has the page moved? */
-               if (unlikely(page != *((void **)pages[i]))) {
+               if (unlikely(page != *slot)) {
                        page_cache_release(page);
                        goto repeat;
                }
 
                pages[ret] = page;
-               ret++;
+               if (++ret == nr_pages)
+                       break;
        }
 
-       /*
-        * If all entries were removed before we could secure them,
-        * try again, because callers stop trying once 0 is returned.
-        */
-       if (unlikely(!ret && nr_found > nr_skip))
-               goto restart;
        rcu_read_unlock();
        return ret;
 }
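The rewrite keeps find_get_pages()' external contract: each returned page
carries a reference that the caller must drop.  A sketch of a typical
caller loop (the scan itself is hypothetical):

    struct page *pages[16];
    pgoff_t index = 0;
    unsigned int i, nr;

    while ((nr = find_get_pages(mapping, index, 16, pages))) {
            for (i = 0; i < nr; i++) {
                    struct page *page = pages[i];

                    index = page->index + 1;        /* resume after this page */
                    /* ... operate on the page ... */
                    page_cache_release(page);       /* drop the gang-lookup ref */
            }
    }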
@@ -887,21 +880,22 @@ repeat:
 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                               unsigned int nr_pages, struct page **pages)
 {
-       unsigned int i;
-       unsigned int ret;
-       unsigned int nr_found;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned int ret = 0;
+
+       if (unlikely(!nr_pages))
+               return 0;
 
        rcu_read_lock();
 restart:
-       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, NULL, index, nr_pages);
-       ret = 0;
-       for (i = 0; i < nr_found; i++) {
+       radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
                struct page *page;
 repeat:
-               page = radix_tree_deref_slot((void **)pages[i]);
+               page = radix_tree_deref_slot(slot);
+               /* A hole: there is no reason to continue */
                if (unlikely(!page))
-                       continue;
+                       break;
 
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
@@ -924,7 +918,7 @@ repeat:
                        goto repeat;
 
                /* Has the page moved? */
-               if (unlikely(page != *((void **)pages[i]))) {
+               if (unlikely(page != *slot)) {
                        page_cache_release(page);
                        goto repeat;
                }
@@ -934,14 +928,14 @@ repeat:
                 * otherwise we can get both false positives and false
                 * negatives, which is just confusing to the caller.
                 */
-               if (page->mapping == NULL || page->index != index) {
+               if (page->mapping == NULL || page->index != iter.index) {
                        page_cache_release(page);
                        break;
                }
 
                pages[ret] = page;
-               ret++;
-               index++;
+               if (++ret == nr_pages)
+                       break;
        }
        rcu_read_unlock();
        return ret;
@@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                        int tag, unsigned int nr_pages, struct page **pages)
 {
-       unsigned int i;
-       unsigned int ret;
-       unsigned int nr_found;
+       struct radix_tree_iter iter;
+       void **slot;
+       unsigned ret = 0;
+
+       if (unlikely(!nr_pages))
+               return 0;
 
        rcu_read_lock();
 restart:
-       nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
-                               (void ***)pages, *index, nr_pages, tag);
-       ret = 0;
-       for (i = 0; i < nr_found; i++) {
+       radix_tree_for_each_tagged(slot, &mapping->page_tree,
+                                  &iter, *index, tag) {
                struct page *page;
 repeat:
-               page = radix_tree_deref_slot((void **)pages[i]);
+               page = radix_tree_deref_slot(slot);
                if (unlikely(!page))
                        continue;
 
@@ -998,21 +993,16 @@ repeat:
                        goto repeat;
 
                /* Has the page moved? */
-               if (unlikely(page != *((void **)pages[i]))) {
+               if (unlikely(page != *slot)) {
                        page_cache_release(page);
                        goto repeat;
                }
 
                pages[ret] = page;
-               ret++;
+               if (++ret == nr_pages)
+                       break;
        }
 
-       /*
-        * If all entries were removed before we could secure them,
-        * try again, because callers stop trying once 0 is returned.
-        */
-       if (unlikely(!ret && nr_found))
-               goto restart;
        rcu_read_unlock();
 
        if (ret)
index b2ee6df0e9bb31eebd3b2f1528cbbcccb2b9c43e..7d698df4a067ce591fd661f45e098610e1a027db 100644 (file)
@@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                return 0;
        }
 
+       if (pmd_trans_unstable(pmd))
+               return 0;
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (get_mctgt_type(vma, addr, *pte, NULL))
@@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                return 0;
        }
 
+       if (pmd_trans_unstable(pmd))
+               return 0;
 retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
index caea788628e4dd62060e4838d385e9858f415f25..a712fb9e04ce4c6914c34b4a80f5526c7b49f346 100644 (file)
@@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg)
 }
 
 /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
  */
 void drain_all_pages(void)
 {
-       on_each_cpu(drain_local_pages, NULL, 1);
+       int cpu;
+       struct per_cpu_pageset *pcp;
+       struct zone *zone;
+
+       /*
+        * Allocate in the BSS so we won't require allocation in
+        * the direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+        */
+       static cpumask_t cpus_with_pcps;
+
+       /*
+        * We don't care about racing with CPU hotplug events,
+        * as the offline notification will make the notified
+        * CPU drain its own pcp lists, and on_each_cpu_mask()
+        * disables preemption as part of its processing.
+        */
+       for_each_online_cpu(cpu) {
+               bool has_pcps = false;
+               for_each_populated_zone(zone) {
+                       pcp = per_cpu_ptr(zone->pageset, cpu);
+                       if (pcp->pcp.count) {
+                               has_pcps = true;
+                               break;
+                       }
+               }
+               if (has_pcps)
+                       cpumask_set_cpu(cpu, &cpus_with_pcps);
+               else
+                       cpumask_clear_cpu(cpu, &cpus_with_pcps);
+       }
+       on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -2308,6 +2344,10 @@ rebalance:
                if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                        if (oom_killer_disabled)
                                goto nopage;
+                       /* Coredumps can quickly deplete all memory reserves */
+                       if ((current->flags & PF_DUMPCORE) &&
+                           !(gfp_mask & __GFP_NOFAIL))
+                               goto nopage;
                        page = __alloc_pages_may_oom(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask, preferred_zone,
index 64d9966d16bc5961f44c37bd7f86441227447ec4..ffe13fdf8144adadfcc0a55b976f412009e87b38 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2035,9 +2035,17 @@ static void flush_cpu_slab(void *d)
        __flush_cpu_slab(s, smp_processor_id());
 }
 
+static bool has_cpu_slab(int cpu, void *info)
+{
+       struct kmem_cache *s = info;
+       struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+       return !!(c->page);
+}
+
 static void flush_all(struct kmem_cache *s)
 {
-       on_each_cpu(flush_cpu_slab, s, 1);
+       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
 }
 
 /*
index dae42f380d6ebcde88d3aaef6aa3cb1d64e0f5a3..fafc26d1b1dc885d2541eda3bdc5705a4fe56012 100644 (file)
@@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        struct page *page = NULL;
        struct inode *inode = NULL;
 
+       if (swap_flags & ~SWAP_FLAGS_VALID)
+               return -EINVAL;
+
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
index 18aded3a89fcc1540e93966a2e5f8be5544b7ffd..61a183b89df6d15c358e7afc2b418411abe5e728 100644 (file)
@@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
        return 0;
 }
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+       struct address_space *mapping = inode->i_mapping;
+       loff_t unmap_start = round_up(lstart, PAGE_SIZE);
+       loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
+       /*
+        * This rounding is currently just for example: unmap_mapping_range
+        * expands its hole outwards, whereas we want it to contract the hole
+        * inwards.  However, existing callers of truncate_pagecache_range are
+        * doing their own page rounding first; and truncate_inode_pages_range
+        * currently BUGs if lend is not pagealigned-1 (it handles partial
+        * page at start of hole, but not partial page at end of hole).  Note
+        * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+        */
+
+       /*
+        * Unlike in truncate_pagecache, unmap_mapping_range is called only
+        * once (before truncating pagecache), and without "even_cows" flag:
+        * hole-punching should not remove private COWed pages from the hole.
+        */
+       if ((u64)unmap_end > (u64)unmap_start)
+               unmap_mapping_range(mapping, unmap_start,
+                                   1 + unmap_end - unmap_start, 0);
+       truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_pagecache_range);
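The kernel-doc above prescribes calling the helper before the filesystem
frees the underlying blocks.  A hedged sketch of a punch-hole path in a
hypothetical filesystem (myfs_free_blocks() is made up; a real caller must
also pick lstart/lend so that lend is page-aligned minus one, as the
comment requires):

    static int myfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
    {
            loff_t lstart = round_up(offset, PAGE_CACHE_SIZE);
            loff_t lend = round_down(offset + len, PAGE_CACHE_SIZE) - 1;

            if (lend < lstart)
                    return 0;       /* hole does not cover a full page */

            mutex_lock(&inode->i_mutex);
            /* Drop pagecache first, so writepage never sees freed blocks. */
            truncate_pagecache_range(inode, lstart, lend);
            /* Then deallocate the on-disk blocks (hypothetical helper). */
            myfs_free_blocks(inode, lstart, lend);
            mutex_unlock(&inode->i_mutex);
            return 0;
    }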
diff --git a/tools/slub/slabinfo.c b/tools/slub/slabinfo.c
deleted file mode 100644 (file)
index 164cbcf..0000000
+++ /dev/null
@@ -1,1393 +0,0 @@
-/*
- * Slabinfo: Tool to get reports about slabs
- *
- * (C) 2007 sgi, Christoph Lameter
- * (C) 2011 Linux Foundation, Christoph Lameter
- *
- * Compile with:
- *
- * gcc -o slabinfo slabinfo.c
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <strings.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <getopt.h>
-#include <regex.h>
-#include <errno.h>
-
-#define MAX_SLABS 500
-#define MAX_ALIASES 500
-#define MAX_NODES 1024
-
-struct slabinfo {
-       char *name;
-       int alias;
-       int refs;
-       int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
-       int hwcache_align, object_size, objs_per_slab;
-       int sanity_checks, slab_size, store_user, trace;
-       int order, poison, reclaim_account, red_zone;
-       unsigned long partial, objects, slabs, objects_partial, objects_total;
-       unsigned long alloc_fastpath, alloc_slowpath;
-       unsigned long free_fastpath, free_slowpath;
-       unsigned long free_frozen, free_add_partial, free_remove_partial;
-       unsigned long alloc_from_partial, alloc_slab, free_slab, alloc_refill;
-       unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
-       unsigned long deactivate_to_head, deactivate_to_tail;
-       unsigned long deactivate_remote_frees, order_fallback;
-       unsigned long cmpxchg_double_cpu_fail, cmpxchg_double_fail;
-       unsigned long alloc_node_mismatch, deactivate_bypass;
-       unsigned long cpu_partial_alloc, cpu_partial_free;
-       int numa[MAX_NODES];
-       int numa_partial[MAX_NODES];
-} slabinfo[MAX_SLABS];
-
-struct aliasinfo {
-       char *name;
-       char *ref;
-       struct slabinfo *slab;
-} aliasinfo[MAX_ALIASES];
-
-int slabs = 0;
-int actual_slabs = 0;
-int aliases = 0;
-int alias_targets = 0;
-int highest_node = 0;
-
-char buffer[4096];
-
-int show_empty = 0;
-int show_report = 0;
-int show_alias = 0;
-int show_slab = 0;
-int skip_zero = 1;
-int show_numa = 0;
-int show_track = 0;
-int show_first_alias = 0;
-int validate = 0;
-int shrink = 0;
-int show_inverted = 0;
-int show_single_ref = 0;
-int show_totals = 0;
-int sort_size = 0;
-int sort_active = 0;
-int set_debug = 0;
-int show_ops = 0;
-int show_activity = 0;
-
-/* Debug options */
-int sanity = 0;
-int redzone = 0;
-int poison = 0;
-int tracking = 0;
-int tracing = 0;
-
-int page_size;
-
-regex_t pattern;
-
-static void fatal(const char *x, ...)
-{
-       va_list ap;
-
-       va_start(ap, x);
-       vfprintf(stderr, x, ap);
-       va_end(ap);
-       exit(EXIT_FAILURE);
-}
-
-static void usage(void)
-{
-       printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
-               "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
-               "-a|--aliases           Show aliases\n"
-               "-A|--activity          Most active slabs first\n"
-               "-d<options>|--debug=<options> Set/Clear Debug options\n"
-               "-D|--display-active    Switch line format to activity\n"
-               "-e|--empty             Show empty slabs\n"
-               "-f|--first-alias       Show first alias\n"
-               "-h|--help              Show usage information\n"
-               "-i|--inverted          Inverted list\n"
-               "-l|--slabs             Show slabs\n"
-               "-n|--numa              Show NUMA information\n"
-               "-o|--ops               Show kmem_cache_ops\n"
-               "-s|--shrink            Shrink slabs\n"
-               "-r|--report            Detailed report on single slabs\n"
-               "-S|--Size              Sort by size\n"
-               "-t|--tracking          Show alloc/free information\n"
-               "-T|--Totals            Show summary information\n"
-               "-v|--validate          Validate slabs\n"
-               "-z|--zero              Include empty slabs\n"
-               "-1|--1ref              Single reference\n"
-               "\nValid debug options (FZPUT may be combined)\n"
-               "a / A          Switch on all debug options (=FZUP)\n"
-               "-              Switch off all debug options\n"
-               "f / F          Sanity Checks (SLAB_DEBUG_FREE)\n"
-               "z / Z          Redzoning\n"
-               "p / P          Poisoning\n"
-               "u / U          Tracking\n"
-               "t / T          Tracing\n"
-       );
-}
-
-static unsigned long read_obj(const char *name)
-{
-       FILE *f = fopen(name, "r");
-
-       if (!f)
-               buffer[0] = 0;
-       else {
-               if (!fgets(buffer, sizeof(buffer), f))
-                       buffer[0] = 0;
-               fclose(f);
-               if (strlen(buffer) && buffer[strlen(buffer) - 1] == '\n')
-                       buffer[strlen(buffer) - 1] = 0;
-       }
-       return strlen(buffer);
-}
-
-
-/*
- * Get the contents of an attribute
- */
-static unsigned long get_obj(const char *name)
-{
-       if (!read_obj(name))
-               return 0;
-
-       return atol(buffer);
-}
-
-static unsigned long get_obj_and_str(const char *name, char **x)
-{
-       unsigned long result = 0;
-       char *p;
-
-       *x = NULL;
-
-       if (!read_obj(name)) {
-               return 0;
-       }
-       result = strtoul(buffer, &p, 10);
-       while (*p == ' ')
-               p++;
-       if (*p)
-               *x = strdup(p);
-       return result;
-}
-
-static void set_obj(struct slabinfo *s, const char *name, int n)
-{
-       char x[100];
-       FILE *f;
-
-       snprintf(x, 100, "%s/%s", s->name, name);
-       f = fopen(x, "w");
-       if (!f)
-               fatal("Cannot write to %s\n", x);
-
-       fprintf(f, "%d\n", n);
-       fclose(f);
-}
-
-static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
-{
-       char x[100];
-       FILE *f;
-       size_t l;
-
-       snprintf(x, 100, "%s/%s", s->name, name);
-       f = fopen(x, "r");
-       if (!f) {
-               buffer[0] = 0;
-               l = 0;
-       } else {
-               l = fread(buffer, 1, sizeof(buffer) - 1, f);
-               buffer[l] = 0;
-               fclose(f);
-       }
-       return l;
-}
-
-
-/*
- * Put a size string together
- */
-static int store_size(char *buffer, unsigned long value)
-{
-       unsigned long divisor = 1;
-       char trailer = 0;
-       int n;
-
-       if (value > 1000000000UL) {
-               divisor = 100000000UL;
-               trailer = 'G';
-       } else if (value > 1000000UL) {
-               divisor = 100000UL;
-               trailer = 'M';
-       } else if (value > 1000UL) {
-               divisor = 100;
-               trailer = 'K';
-       }
-
-       value /= divisor;
-       n = sprintf(buffer, "%ld",value);
-       if (trailer) {
-               buffer[n] = trailer;
-               n++;
-               buffer[n] = 0;
-       }
-       if (divisor != 1) {
-               memmove(buffer + n - 2, buffer + n - 3, 4);
-               buffer[n-2] = '.';
-               n++;
-       }
-       return n;
-}
-
-static void decode_numa_list(int *numa, char *t)
-{
-       int node;
-       int nr;
-
-       memset(numa, 0, MAX_NODES * sizeof(int));
-
-       if (!t)
-               return;
-
-       while (*t == 'N') {
-               t++;
-               node = strtoul(t, &t, 10);
-               if (*t == '=') {
-                       t++;
-                       nr = strtoul(t, &t, 10);
-                       numa[node] = nr;
-                       if (node > highest_node)
-                               highest_node = node;
-               }
-               while (*t == ' ')
-                       t++;
-       }
-}
-
-static void slab_validate(struct slabinfo *s)
-{
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       set_obj(s, "validate", 1);
-}
-
-static void slab_shrink(struct slabinfo *s)
-{
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       set_obj(s, "shrink", 1);
-}
-
-int line = 0;
-
-static void first_line(void)
-{
-       if (show_activity)
-               printf("Name                   Objects      Alloc       Free   %%Fast Fallb O CmpX   UL\n");
-       else
-               printf("Name                   Objects Objsize    Space "
-                       "Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n");
-}
-
-/*
- * Find the longest alias of a slab; an alias starting with "kmall" wins
- * immediately.
- */
-static struct aliasinfo *find_one_alias(struct slabinfo *find)
-{
-       struct aliasinfo *a;
-       struct aliasinfo *best = NULL;
-
-       for(a = aliasinfo;a < aliasinfo + aliases; a++) {
-               if (a->slab == find &&
-                       (!best || strlen(best->name) < strlen(a->name))) {
-                               best = a;
-                               if (strncmp(a->name,"kmall", 5) == 0)
-                                       return best;
-                       }
-       }
-       return best;
-}
-
-static unsigned long slab_size(struct slabinfo *s)
-{
-       return  s->slabs * (page_size << s->order);
-}
-
-static unsigned long slab_activity(struct slabinfo *s)
-{
-       return  s->alloc_fastpath + s->free_fastpath +
-               s->alloc_slowpath + s->free_slowpath;
-}
-
-static void slab_numa(struct slabinfo *s, int mode)
-{
-       int node;
-
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       if (!highest_node) {
-               printf("\n%s: No NUMA information available.\n", s->name);
-               return;
-       }
-
-       if (skip_zero && !s->slabs)
-               return;
-
-       if (!line) {
-               printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
-               for(node = 0; node <= highest_node; node++)
-                       printf(" %4d", node);
-               printf("\n----------------------");
-               for(node = 0; node <= highest_node; node++)
-                       printf("-----");
-               printf("\n");
-       }
-       printf("%-21s ", mode ? "All slabs" : s->name);
-       for(node = 0; node <= highest_node; node++) {
-               char b[20];
-
-               store_size(b, s->numa[node]);
-               printf(" %4s", b);
-       }
-       printf("\n");
-       if (mode) {
-               printf("%-21s ", "Partial slabs");
-               for(node = 0; node <= highest_node; node++) {
-                       char b[20];
-
-                       store_size(b, s->numa_partial[node]);
-                       printf(" %4s", b);
-               }
-               printf("\n");
-       }
-       line++;
-}
-
-static void show_tracking(struct slabinfo *s)
-{
-       printf("\n%s: Kernel object allocation\n", s->name);
-       printf("-----------------------------------------------------------------------\n");
-       if (read_slab_obj(s, "alloc_calls"))
-               printf("%s", buffer);
-       else
-               printf("No Data\n");
-
-       printf("\n%s: Kernel object freeing\n", s->name);
-       printf("------------------------------------------------------------------------\n");
-       if (read_slab_obj(s, "free_calls"))
-               printf("%s", buffer);
-       else
-               printf("No Data\n");
-
-}
-
-static void ops(struct slabinfo *s)
-{
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       if (read_slab_obj(s, "ops")) {
-               printf("\n%s: kmem_cache operations\n", s->name);
-               printf("--------------------------------------------\n");
-               printf("%s", buffer);
-       } else
-               printf("\n%s has no kmem_cache operations\n", s->name);
-}
-
-static const char *onoff(int x)
-{
-       if (x)
-               return "On ";
-       return "Off";
-}
-
-static void slab_stats(struct slabinfo *s)
-{
-       unsigned long total_alloc;
-       unsigned long total_free;
-       unsigned long total;
-
-       if (!s->alloc_slab)
-               return;
-
-       total_alloc = s->alloc_fastpath + s->alloc_slowpath;
-       total_free = s->free_fastpath + s->free_slowpath;
-
-       if (!total_alloc)
-               return;
-
-       printf("\n");
-       printf("Slab Perf Counter       Alloc     Free %%Al %%Fr\n");
-       printf("--------------------------------------------------\n");
-       printf("Fastpath             %8lu %8lu %3lu %3lu\n",
-               s->alloc_fastpath, s->free_fastpath,
-               s->alloc_fastpath * 100 / total_alloc,
-               s->free_fastpath * 100 / total_free);
-       printf("Slowpath             %8lu %8lu %3lu %3lu\n",
-               total_alloc - s->alloc_fastpath, s->free_slowpath,
-               (total_alloc - s->alloc_fastpath) * 100 / total_alloc,
-               s->free_slowpath * 100 / total_free);
-       printf("Page Alloc           %8lu %8lu %3lu %3lu\n",
-               s->alloc_slab, s->free_slab,
-               s->alloc_slab * 100 / total_alloc,
-               s->free_slab * 100 / total_free);
-       printf("Add partial          %8lu %8lu %3lu %3lu\n",
-               s->deactivate_to_head + s->deactivate_to_tail,
-               s->free_add_partial,
-               (s->deactivate_to_head + s->deactivate_to_tail) * 100 / total_alloc,
-               s->free_add_partial * 100 / total_free);
-       printf("Remove partial       %8lu %8lu %3lu %3lu\n",
-               s->alloc_from_partial, s->free_remove_partial,
-               s->alloc_from_partial * 100 / total_alloc,
-               s->free_remove_partial * 100 / total_free);
-
-       printf("Cpu partial list     %8lu %8lu %3lu %3lu\n",
-               s->cpu_partial_alloc, s->cpu_partial_free,
-               s->cpu_partial_alloc * 100 / total_alloc,
-               s->cpu_partial_free * 100 / total_free);
-
-       printf("RemoteObj/SlabFrozen %8lu %8lu %3lu %3lu\n",
-               s->deactivate_remote_frees, s->free_frozen,
-               s->deactivate_remote_frees * 100 / total_alloc,
-               s->free_frozen * 100 / total_free);
-
-       printf("Total                %8lu %8lu\n\n", total_alloc, total_free);
-
-       if (s->cpuslab_flush)
-               printf("Flushes %8lu\n", s->cpuslab_flush);
-
-       total = s->deactivate_full + s->deactivate_empty +
-                       s->deactivate_to_head + s->deactivate_to_tail + s->deactivate_bypass;
-
-       if (total) {
-               printf("\nSlab Deactivation             Ocurrences  %%\n");
-               printf("-------------------------------------------------\n");
-               printf("Slab full                     %7lu  %3lu%%\n",
-                       s->deactivate_full, (s->deactivate_full * 100) / total);
-               printf("Slab empty                    %7lu  %3lu%%\n",
-                       s->deactivate_empty, (s->deactivate_empty * 100) / total);
-               printf("Moved to head of partial list %7lu  %3lu%%\n",
-                       s->deactivate_to_head, (s->deactivate_to_head * 100) / total);
-               printf("Moved to tail of partial list %7lu  %3lu%%\n",
-                       s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
-               printf("Deactivation bypass           %7lu  %3lu%%\n",
-                       s->deactivate_bypass, (s->deactivate_bypass * 100) / total);
-               printf("Refilled from foreign frees   %7lu  %3lu%%\n",
-                       s->alloc_refill, (s->alloc_refill * 100) / total);
-               printf("Node mismatch                 %7lu  %3lu%%\n",
-                       s->alloc_node_mismatch, (s->alloc_node_mismatch * 100) / total);
-       }
-
-       if (s->cmpxchg_double_fail || s->cmpxchg_double_cpu_fail) {
-               printf("\nCmpxchg_double Looping\n------------------------\n");
-               printf("Locked Cmpxchg Double redos   %lu\nUnlocked Cmpxchg Double redos %lu\n",
-                       s->cmpxchg_double_fail, s->cmpxchg_double_cpu_fail);
-       }
-}
-
-static void report(struct slabinfo *s)
-{
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       printf("\nSlabcache: %-20s  Aliases: %2d Order : %2d Objects: %lu\n",
-               s->name, s->aliases, s->order, s->objects);
-       if (s->hwcache_align)
-               printf("** Hardware cacheline aligned\n");
-       if (s->cache_dma)
-               printf("** Memory is allocated in a special DMA zone\n");
-       if (s->destroy_by_rcu)
-               printf("** Slabs are destroyed via RCU\n");
-       if (s->reclaim_account)
-               printf("** Reclaim accounting active\n");
-
-       printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
-       printf("------------------------------------------------------------------------\n");
-       printf("Object : %7d  Total  : %7ld   Sanity Checks : %s  Total: %7ld\n",
-                       s->object_size, s->slabs, onoff(s->sanity_checks),
-                       s->slabs * (page_size << s->order));
-       printf("SlabObj: %7d  Full   : %7ld   Redzoning     : %s  Used : %7ld\n",
-                       s->slab_size, s->slabs - s->partial - s->cpu_slabs,
-                       onoff(s->red_zone), s->objects * s->object_size);
-       printf("SlabSiz: %7d  Partial: %7ld   Poisoning     : %s  Loss : %7ld\n",
-                       page_size << s->order, s->partial, onoff(s->poison),
-                       s->slabs * (page_size << s->order) - s->objects * s->object_size);
-       printf("Loss   : %7d  CpuSlab: %7d   Tracking      : %s  Lalig: %7ld\n",
-                       s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
-                       (s->slab_size - s->object_size) * s->objects);
-       printf("Align  : %7d  Objects: %7d   Tracing       : %s  Lpadd: %7ld\n",
-                       s->align, s->objs_per_slab, onoff(s->trace),
-                       ((page_size << s->order) - s->objs_per_slab * s->slab_size) *
-                       s->slabs);
-
-       ops(s);
-       show_tracking(s);
-       slab_numa(s, 1);
-       slab_stats(s);
-}
-
-static void slabcache(struct slabinfo *s)
-{
-       char size_str[20];
-       char dist_str[40];
-       char flags[20];
-       char *p = flags;
-
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       if (actual_slabs == 1) {
-               report(s);
-               return;
-       }
-
-       if (skip_zero && !show_empty && !s->slabs)
-               return;
-
-       if (show_empty && s->slabs)
-               return;
-
-       store_size(size_str, slab_size(s));
-       snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
-                                               s->partial, s->cpu_slabs);
-
-       if (!line++)
-               first_line();
-
-       if (s->aliases)
-               *p++ = '*';
-       if (s->cache_dma)
-               *p++ = 'd';
-       if (s->hwcache_align)
-               *p++ = 'A';
-       if (s->poison)
-               *p++ = 'P';
-       if (s->reclaim_account)
-               *p++ = 'a';
-       if (s->red_zone)
-               *p++ = 'Z';
-       if (s->sanity_checks)
-               *p++ = 'F';
-       if (s->store_user)
-               *p++ = 'U';
-       if (s->trace)
-               *p++ = 'T';
-
-       *p = 0;
-       if (show_activity) {
-               unsigned long total_alloc;
-               unsigned long total_free;
-
-               total_alloc = s->alloc_fastpath + s->alloc_slowpath;
-               total_free = s->free_fastpath + s->free_slowpath;
-
-               printf("%-21s %8ld %10ld %10ld %3ld %3ld %5ld %1d %4ld %4ld\n",
-                       s->name, s->objects,
-                       total_alloc, total_free,
-                       total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
-                       total_free ? (s->free_fastpath * 100 / total_free) : 0,
-                       s->order_fallback, s->order, s->cmpxchg_double_fail,
-                       s->cmpxchg_double_cpu_fail);
-       }
-       else
-               printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
-                       s->name, s->objects, s->object_size, size_str, dist_str,
-                       s->objs_per_slab, s->order,
-                       s->slabs ? (s->partial * 100) / s->slabs : 100,
-                       s->slabs ? (s->objects * s->object_size * 100) /
-                               (s->slabs * (page_size << s->order)) : 100,
-                       flags);
-}
-
-/*
- * Analyze debug options. Return false if something is amiss.
- */
-static int debug_opt_scan(char *opt)
-{
-       if (!opt || !opt[0] || strcmp(opt, "-") == 0)
-               return 1;
-
-       if (strcasecmp(opt, "a") == 0) {
-               sanity = 1;
-               poison = 1;
-               redzone = 1;
-               tracking = 1;
-               return 1;
-       }
-
-       for ( ; *opt; opt++)
-               switch (*opt) {
-               case 'F' : case 'f':
-                       if (sanity)
-                               return 0;
-                       sanity = 1;
-                       break;
-               case 'P' : case 'p':
-                       if (poison)
-                               return 0;
-                       poison = 1;
-                       break;
-
-               case 'Z' : case 'z':
-                       if (redzone)
-                               return 0;
-                       redzone = 1;
-                       break;
-
-               case 'U' : case 'u':
-                       if (tracking)
-                               return 0;
-                       tracking = 1;
-                       break;
-
-               case 'T' : case 't':
-                       if (tracing)
-                               return 0;
-                       tracing = 1;
-                       break;
-               default:
-                       return 0;
-               }
-       return 1;
-}
-
-static int slab_empty(struct slabinfo *s)
-{
-       if (s->objects > 0)
-               return 0;
-
-       /*
-        * We may still have slabs even if there are no objects. Shrinking will
-        * remove them.
-        */
-       if (s->slabs != 0)
-               set_obj(s, "shrink", 1);
-
-       return 1;
-}
-
-static void slab_debug(struct slabinfo *s)
-{
-       if (strcmp(s->name, "*") == 0)
-               return;
-
-       if (sanity && !s->sanity_checks) {
-               set_obj(s, "sanity", 1);
-       }
-       if (!sanity && s->sanity_checks) {
-               if (slab_empty(s))
-                       set_obj(s, "sanity", 0);
-               else
-                       fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
-       }
-       if (redzone && !s->red_zone) {
-               if (slab_empty(s))
-                       set_obj(s, "red_zone", 1);
-               else
-                       fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
-       }
-       if (!redzone && s->red_zone) {
-               if (slab_empty(s))
-                       set_obj(s, "red_zone", 0);
-               else
-                       fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
-       }
-       if (poison && !s->poison) {
-               if (slab_empty(s))
-                       set_obj(s, "poison", 1);
-               else
-                       fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
-       }
-       if (!poison && s->poison) {
-               if (slab_empty(s))
-                       set_obj(s, "poison", 0);
-               else
-                       fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
-       }
-       if (tracking && !s->store_user) {
-               if (slab_empty(s))
-                       set_obj(s, "store_user", 1);
-               else
-                       fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
-       }
-       if (!tracking && s->store_user) {
-               if (slab_empty(s))
-                       set_obj(s, "store_user", 0);
-               else
-                       fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
-       }
-       if (tracing && !s->trace) {
-               if (slabs == 1)
-                       set_obj(s, "trace", 1);
-               else
-                       fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
-       }
-       if (!tracing && s->trace)
-               set_obj(s, "trace", 0);
-}
-
-static void totals(void)
-{
-       struct slabinfo *s;
-
-       int used_slabs = 0;
-       char b1[20], b2[20], b3[20], b4[20];
-       unsigned long long max = 1ULL << 63;
-
-       /* Object size */
-       unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
-
-       /* Number of partial slabs in a slabcache */
-       unsigned long long min_partial = max, max_partial = 0,
-                               avg_partial, total_partial = 0;
-
-       /* Number of slabs in a slab cache */
-       unsigned long long min_slabs = max, max_slabs = 0,
-                               avg_slabs, total_slabs = 0;
-
-       /* Size of the whole slab */
-       unsigned long long min_size = max, max_size = 0,
-                               avg_size, total_size = 0;
-
-       /* Bytes used for object storage in a slab */
-       unsigned long long min_used = max, max_used = 0,
-                               avg_used, total_used = 0;
-
-       /* Waste: Bytes used for alignment and padding */
-       unsigned long long min_waste = max, max_waste = 0,
-                               avg_waste, total_waste = 0;
-       /* Number of objects in a slab */
-       unsigned long long min_objects = max, max_objects = 0,
-                               avg_objects, total_objects = 0;
-       /* Waste per object */
-       unsigned long long min_objwaste = max,
-                               max_objwaste = 0, avg_objwaste,
-                               total_objwaste = 0;
-
-       /* Memory per object */
-       unsigned long long min_memobj = max,
-                               max_memobj = 0, avg_memobj,
-                               total_objsize = 0;
-
-       /* Percentage of partial slabs per slab */
-       unsigned long min_ppart = 100, max_ppart = 0,
-                               avg_ppart, total_ppart = 0;
-
-       /* Number of objects in partial slabs */
-       unsigned long min_partobj = max, max_partobj = 0,
-                               avg_partobj, total_partobj = 0;
-
-       /* Percentage of partial objects of all objects in a slab */
-       unsigned long min_ppartobj = 100, max_ppartobj = 0,
-                               avg_ppartobj, total_ppartobj = 0;
-
-
-       for (s = slabinfo; s < slabinfo + slabs; s++) {
-               unsigned long long size;
-               unsigned long used;
-               unsigned long long wasted;
-               unsigned long long objwaste;
-               unsigned long percentage_partial_slabs;
-               unsigned long percentage_partial_objs;
-
-               if (!s->slabs || !s->objects)
-                       continue;
-
-               used_slabs++;
-
-               size = slab_size(s);
-               used = s->objects * s->object_size;
-               wasted = size - used;
-               objwaste = s->slab_size - s->object_size;
-
-               percentage_partial_slabs = s->partial * 100 / s->slabs;
-               if (percentage_partial_slabs > 100)
-                       percentage_partial_slabs = 100;
-
-               percentage_partial_objs = s->objects_partial * 100
-                                                       / s->objects;
-
-               if (percentage_partial_objs > 100)
-                       percentage_partial_objs = 100;
-
-               if (s->object_size < min_objsize)
-                       min_objsize = s->object_size;
-               if (s->partial < min_partial)
-                       min_partial = s->partial;
-               if (s->slabs < min_slabs)
-                       min_slabs = s->slabs;
-               if (size < min_size)
-                       min_size = size;
-               if (wasted < min_waste)
-                       min_waste = wasted;
-               if (objwaste < min_objwaste)
-                       min_objwaste = objwaste;
-               if (s->objects < min_objects)
-                       min_objects = s->objects;
-               if (used < min_used)
-                       min_used = used;
-               if (s->objects_partial < min_partobj)
-                       min_partobj = s->objects_partial;
-               if (percentage_partial_slabs < min_ppart)
-                       min_ppart = percentage_partial_slabs;
-               if (percentage_partial_objs < min_ppartobj)
-                       min_ppartobj = percentage_partial_objs;
-               if (s->slab_size < min_memobj)
-                       min_memobj = s->slab_size;
-
-               if (s->object_size > max_objsize)
-                       max_objsize = s->object_size;
-               if (s->partial > max_partial)
-                       max_partial = s->partial;
-               if (s->slabs > max_slabs)
-                       max_slabs = s->slabs;
-               if (size > max_size)
-                       max_size = size;
-               if (wasted > max_waste)
-                       max_waste = wasted;
-               if (objwaste > max_objwaste)
-                       max_objwaste = objwaste;
-               if (s->objects > max_objects)
-                       max_objects = s->objects;
-               if (used > max_used)
-                       max_used = used;
-               if (s->objects_partial > max_partobj)
-                       max_partobj = s->objects_partial;
-               if (percentage_partial_slabs > max_ppart)
-                       max_ppart = percentage_partial_slabs;
-               if (percentage_partial_objs > max_ppartobj)
-                       max_ppartobj = percentage_partial_objs;
-               if (s->slab_size > max_memobj)
-                       max_memobj = s->slab_size;
-
-               total_partial += s->partial;
-               total_slabs += s->slabs;
-               total_size += size;
-               total_waste += wasted;
-
-               total_objects += s->objects;
-               total_used += used;
-               total_partobj += s->objects_partial;
-               total_ppart += percentage_partial_slabs;
-               total_ppartobj += percentage_partial_objs;
-
-               total_objwaste += s->objects * objwaste;
-               total_objsize += s->objects * s->slab_size;
-       }
-
-       if (!total_objects) {
-               printf("No objects\n");
-               return;
-       }
-       if (!used_slabs) {
-               printf("No slabs\n");
-               return;
-       }
-
-       /* Per slab averages */
-       avg_partial = total_partial / used_slabs;
-       avg_slabs = total_slabs / used_slabs;
-       avg_size = total_size / used_slabs;
-       avg_waste = total_waste / used_slabs;
-
-       avg_objects = total_objects / used_slabs;
-       avg_used = total_used / used_slabs;
-       avg_partobj = total_partobj / used_slabs;
-       avg_ppart = total_ppart / used_slabs;
-       avg_ppartobj = total_ppartobj / used_slabs;
-
-       /* Per object object sizes */
-       avg_objsize = total_used / total_objects;
-       avg_objwaste = total_objwaste / total_objects;
-       avg_partobj = total_partobj * 100 / total_objects;
-       avg_memobj = total_objsize / total_objects;
-
-       printf("Slabcache Totals\n");
-       printf("----------------\n");
-       printf("Slabcaches : %3d      Aliases  : %3d->%-3d Active: %3d\n",
-                       slabs, aliases, alias_targets, used_slabs);
-
-       store_size(b1, total_size);store_size(b2, total_waste);
-       store_size(b3, total_waste * 100 / total_used);
-       printf("Memory used: %6s   # Loss   : %6s   MRatio:%6s%%\n", b1, b2, b3);
-
-       store_size(b1, total_objects);store_size(b2, total_partobj);
-       store_size(b3, total_partobj * 100 / total_objects);
-       printf("# Objects  : %6s   # PartObj: %6s   ORatio:%6s%%\n", b1, b2, b3);
-
-       printf("\n");
-       printf("Per Cache    Average         Min         Max       Total\n");
-       printf("---------------------------------------------------------\n");
-
-       store_size(b1, avg_objects);store_size(b2, min_objects);
-       store_size(b3, max_objects);store_size(b4, total_objects);
-       printf("#Objects  %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_slabs);store_size(b2, min_slabs);
-       store_size(b3, max_slabs);store_size(b4, total_slabs);
-       printf("#Slabs    %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_partial);store_size(b2, min_partial);
-       store_size(b3, max_partial);store_size(b4, total_partial);
-       printf("#PartSlab %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-       store_size(b1, avg_ppart);store_size(b2, min_ppart);
-       store_size(b3, max_ppart);
-       store_size(b4, total_partial * 100  / total_slabs);
-       printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_partobj);store_size(b2, min_partobj);
-       store_size(b3, max_partobj);
-       store_size(b4, total_partobj);
-       printf("PartObjs  %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
-       store_size(b3, max_ppartobj);
-       store_size(b4, total_partobj * 100 / total_objects);
-       printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_size);store_size(b2, min_size);
-       store_size(b3, max_size);store_size(b4, total_size);
-       printf("Memory    %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_used);store_size(b2, min_used);
-       store_size(b3, max_used);store_size(b4, total_used);
-       printf("Used      %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       store_size(b1, avg_waste);store_size(b2, min_waste);
-       store_size(b3, max_waste);store_size(b4, total_waste);
-       printf("Loss      %10s  %10s  %10s  %10s\n",
-                       b1,     b2,     b3,     b4);
-
-       printf("\n");
-       printf("Per Object   Average         Min         Max\n");
-       printf("---------------------------------------------\n");
-
-       store_size(b1, avg_memobj);store_size(b2, min_memobj);
-       store_size(b3, max_memobj);
-       printf("Memory    %10s  %10s  %10s\n",
-                       b1,     b2,     b3);
-       store_size(b1, avg_objsize);store_size(b2, min_objsize);
-       store_size(b3, max_objsize);
-       printf("User      %10s  %10s  %10s\n",
-                       b1,     b2,     b3);
-
-       store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
-       store_size(b3, max_objwaste);
-       printf("Loss      %10s  %10s  %10s\n",
-                       b1,     b2,     b3);
-}
-
-static void sort_slabs(void)
-{
-       struct slabinfo *s1,*s2;
-
-       for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
-               for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
-                       int result;
-
-                       if (sort_size)
-                               result = slab_size(s1) < slab_size(s2);
-                       else if (sort_active)
-                               result = slab_activity(s1) < slab_activity(s2);
-                       else
-                               result = strcasecmp(s1->name, s2->name);
-
-                       if (show_inverted)
-                               result = -result;
-
-                       if (result > 0) {
-                               struct slabinfo t;
-
-                               memcpy(&t, s1, sizeof(struct slabinfo));
-                               memcpy(s1, s2, sizeof(struct slabinfo));
-                               memcpy(s2, &t, sizeof(struct slabinfo));
-                       }
-               }
-       }
-}
-
-static void sort_aliases(void)
-{
-       struct aliasinfo *a1,*a2;
-
-       for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
-               for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
-                       char *n1, *n2;
-
-                       n1 = a1->name;
-                       n2 = a2->name;
-                       if (show_alias && !show_inverted) {
-                               n1 = a1->ref;
-                               n2 = a2->ref;
-                       }
-                       if (strcasecmp(n1, n2) > 0) {
-                               struct aliasinfo t;
-
-                               memcpy(&t, a1, sizeof(struct aliasinfo));
-                               memcpy(a1, a2, sizeof(struct aliasinfo));
-                               memcpy(a2, &t, sizeof(struct aliasinfo));
-                       }
-               }
-       }
-}
-
-static void link_slabs(void)
-{
-       struct aliasinfo *a;
-       struct slabinfo *s;
-
-       for (a = aliasinfo; a < aliasinfo + aliases; a++) {
-
-               for (s = slabinfo; s < slabinfo + slabs; s++)
-                       if (strcmp(a->ref, s->name) == 0) {
-                               a->slab = s;
-                               s->refs++;
-                               break;
-                       }
-               if (s == slabinfo + slabs)
-                       fatal("Unresolved alias %s\n", a->ref);
-       }
-}
-
-static void alias(void)
-{
-       struct aliasinfo *a;
-       char *active = NULL;
-
-       sort_aliases();
-       link_slabs();
-
-       for(a = aliasinfo; a < aliasinfo + aliases; a++) {
-
-               if (!show_single_ref && a->slab->refs == 1)
-                       continue;
-
-               if (!show_inverted) {
-                       if (active) {
-                               if (strcmp(a->slab->name, active) == 0) {
-                                       printf(" %s", a->name);
-                                       continue;
-                               }
-                       }
-                       printf("\n%-12s <- %s", a->slab->name, a->name);
-                       active = a->slab->name;
-               }
-               else
-                       printf("%-20s -> %s\n", a->name, a->slab->name);
-       }
-       if (active)
-               printf("\n");
-}
-
-
-static void rename_slabs(void)
-{
-       struct slabinfo *s;
-       struct aliasinfo *a;
-
-       for (s = slabinfo; s < slabinfo + slabs; s++) {
-               if (*s->name != ':')
-                       continue;
-
-               if (s->refs > 1 && !show_first_alias)
-                       continue;
-
-               a = find_one_alias(s);
-
-               if (a)
-                       s->name = a->name;
-               else {
-                       s->name = "*";
-                       actual_slabs--;
-               }
-       }
-}
-
-static int slab_mismatch(char *slab)
-{
-       return regexec(&pattern, slab, 0, NULL, 0);
-}
-
-static void read_slab_dir(void)
-{
-       DIR *dir;
-       struct dirent *de;
-       struct slabinfo *slab = slabinfo;
-       struct aliasinfo *alias = aliasinfo;
-       char *p;
-       char *t;
-       int count;
-
-       if (chdir("/sys/kernel/slab") && chdir("/sys/slab"))
-               fatal("SYSFS support for SLUB not active\n");
-
-       dir = opendir(".");
-       while ((de = readdir(dir))) {
-               if (de->d_name[0] == '.' ||
-                       (de->d_name[0] != ':' && slab_mismatch(de->d_name)))
-                               continue;
-               switch (de->d_type) {
-                  case DT_LNK:
-                       alias->name = strdup(de->d_name);
-                       count = readlink(de->d_name, buffer, sizeof(buffer)-1);
-
-                       if (count < 0)
-                               fatal("Cannot read symlink %s\n", de->d_name);
-
-                       buffer[count] = 0;
-                       p = buffer + count;
-                       while (p > buffer && p[-1] != '/')
-                               p--;
-                       alias->ref = strdup(p);
-                       alias++;
-                       break;
-                  case DT_DIR:
-                       if (chdir(de->d_name))
-                               fatal("Unable to access slab %s\n", slab->name);
-                       slab->name = strdup(de->d_name);
-                       slab->alias = 0;
-                       slab->refs = 0;
-                       slab->aliases = get_obj("aliases");
-                       slab->align = get_obj("align");
-                       slab->cache_dma = get_obj("cache_dma");
-                       slab->cpu_slabs = get_obj("cpu_slabs");
-                       slab->destroy_by_rcu = get_obj("destroy_by_rcu");
-                       slab->hwcache_align = get_obj("hwcache_align");
-                       slab->object_size = get_obj("object_size");
-                       slab->objects = get_obj("objects");
-                       slab->objects_partial = get_obj("objects_partial");
-                       slab->objects_total = get_obj("objects_total");
-                       slab->objs_per_slab = get_obj("objs_per_slab");
-                       slab->order = get_obj("order");
-                       slab->partial = get_obj("partial");
-                       slab->partial = get_obj_and_str("partial", &t);
-                       decode_numa_list(slab->numa_partial, t);
-                       free(t);
-                       slab->poison = get_obj("poison");
-                       slab->reclaim_account = get_obj("reclaim_account");
-                       slab->red_zone = get_obj("red_zone");
-                       slab->sanity_checks = get_obj("sanity_checks");
-                       slab->slab_size = get_obj("slab_size");
-                       slab->slabs = get_obj_and_str("slabs", &t);
-                       decode_numa_list(slab->numa, t);
-                       free(t);
-                       slab->store_user = get_obj("store_user");
-                       slab->trace = get_obj("trace");
-                       slab->alloc_fastpath = get_obj("alloc_fastpath");
-                       slab->alloc_slowpath = get_obj("alloc_slowpath");
-                       slab->free_fastpath = get_obj("free_fastpath");
-                       slab->free_slowpath = get_obj("free_slowpath");
-                       slab->free_frozen= get_obj("free_frozen");
-                       slab->free_add_partial = get_obj("free_add_partial");
-                       slab->free_remove_partial = get_obj("free_remove_partial");
-                       slab->alloc_from_partial = get_obj("alloc_from_partial");
-                       slab->alloc_slab = get_obj("alloc_slab");
-                       slab->alloc_refill = get_obj("alloc_refill");
-                       slab->free_slab = get_obj("free_slab");
-                       slab->cpuslab_flush = get_obj("cpuslab_flush");
-                       slab->deactivate_full = get_obj("deactivate_full");
-                       slab->deactivate_empty = get_obj("deactivate_empty");
-                       slab->deactivate_to_head = get_obj("deactivate_to_head");
-                       slab->deactivate_to_tail = get_obj("deactivate_to_tail");
-                       slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
-                       slab->order_fallback = get_obj("order_fallback");
-                       slab->cmpxchg_double_cpu_fail = get_obj("cmpxchg_double_cpu_fail");
-                       slab->cmpxchg_double_fail = get_obj("cmpxchg_double_fail");
-                       slab->cpu_partial_alloc = get_obj("cpu_partial_alloc");
-                       slab->cpu_partial_free = get_obj("cpu_partial_free");
-                       slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
-                       slab->deactivate_bypass = get_obj("deactivate_bypass");
-                       chdir("..");
-                       if (slab->name[0] == ':')
-                               alias_targets++;
-                       slab++;
-                       break;
-                  default :
-                       fatal("Unknown file type %lx\n", de->d_type);
-               }
-       }
-       closedir(dir);
-       slabs = slab - slabinfo;
-       actual_slabs = slabs;
-       aliases = alias - aliasinfo;
-       if (slabs > MAX_SLABS)
-               fatal("Too many slabs\n");
-       if (aliases > MAX_ALIASES)
-               fatal("Too many aliases\n");
-}
-
-static void output_slabs(void)
-{
-       struct slabinfo *slab;
-
-       for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
-
-               if (slab->alias)
-                       continue;
-
-
-               if (show_numa)
-                       slab_numa(slab, 0);
-               else if (show_track)
-                       show_tracking(slab);
-               else if (validate)
-                       slab_validate(slab);
-               else if (shrink)
-                       slab_shrink(slab);
-               else if (set_debug)
-                       slab_debug(slab);
-               else if (show_ops)
-                       ops(slab);
-               else if (show_slab)
-                       slabcache(slab);
-               else if (show_report)
-                       report(slab);
-       }
-}
-
-struct option opts[] = {
-       { "aliases", 0, NULL, 'a' },
-       { "activity", 0, NULL, 'A' },
-       { "debug", 2, NULL, 'd' },
-       { "display-activity", 0, NULL, 'D' },
-       { "empty", 0, NULL, 'e' },
-       { "first-alias", 0, NULL, 'f' },
-       { "help", 0, NULL, 'h' },
-       { "inverted", 0, NULL, 'i'},
-       { "numa", 0, NULL, 'n' },
-       { "ops", 0, NULL, 'o' },
-       { "report", 0, NULL, 'r' },
-       { "shrink", 0, NULL, 's' },
-       { "slabs", 0, NULL, 'l' },
-       { "track", 0, NULL, 't'},
-       { "validate", 0, NULL, 'v' },
-       { "zero", 0, NULL, 'z' },
-       { "1ref", 0, NULL, '1'},
-       { NULL, 0, NULL, 0 }
-};
-
-int main(int argc, char *argv[])
-{
-       int c;
-       int err;
-       char *pattern_source;
-
-       page_size = getpagesize();
-
-       while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
-                                               opts, NULL)) != -1)
-               switch (c) {
-               case '1':
-                       show_single_ref = 1;
-                       break;
-               case 'a':
-                       show_alias = 1;
-                       break;
-               case 'A':
-                       sort_active = 1;
-                       break;
-               case 'd':
-                       set_debug = 1;
-                       if (!debug_opt_scan(optarg))
-                               fatal("Invalid debug option '%s'\n", optarg);
-                       break;
-               case 'D':
-                       show_activity = 1;
-                       break;
-               case 'e':
-                       show_empty = 1;
-                       break;
-               case 'f':
-                       show_first_alias = 1;
-                       break;
-               case 'h':
-                       usage();
-                       return 0;
-               case 'i':
-                       show_inverted = 1;
-                       break;
-               case 'n':
-                       show_numa = 1;
-                       break;
-               case 'o':
-                       show_ops = 1;
-                       break;
-               case 'r':
-                       show_report = 1;
-                       break;
-               case 's':
-                       shrink = 1;
-                       break;
-               case 'l':
-                       show_slab = 1;
-                       break;
-               case 't':
-                       show_track = 1;
-                       break;
-               case 'v':
-                       validate = 1;
-                       break;
-               case 'z':
-                       skip_zero = 0;
-                       break;
-               case 'T':
-                       show_totals = 1;
-                       break;
-               case 'S':
-                       sort_size = 1;
-                       break;
-
-               default:
-                       fatal("%s: Invalid option '%c'\n", argv[0], optopt);
-
-       }
-
-       if (!show_slab && !show_alias && !show_track && !show_report
-               && !validate && !shrink && !set_debug && !show_ops)
-                       show_slab = 1;
-
-       if (argc > optind)
-               pattern_source = argv[optind];
-       else
-               pattern_source = ".*";
-
-       err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
-       if (err)
-               fatal("%s: Invalid pattern '%s' code %d\n",
-                       argv[0], pattern_source, err);
-       read_slab_dir();
-       if (show_alias)
-               alias();
-       else
-       if (show_totals)
-               totals();
-       else {
-               link_slabs();
-               rename_slabs();
-               sort_slabs();
-               output_slabs();
-       }
-       return 0;
-}
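
All of the attribute reads above use one sysfs pattern: open an attribute file under /sys/kernel/slab/<cache>/ and parse the leading number.  A minimal standalone sketch of that pattern (the "dentry"/"objects" names are just examples, not part of the tool):

	#include <stdio.h>
	#include <stdlib.h>

	/* Read one numeric SLUB attribute, as get_obj() does above. */
	static long read_slab_attr(const char *cache, const char *attr)
	{
		char path[256];
		char buf[64];
		long val = 0;
		FILE *f;

		snprintf(path, sizeof(path), "/sys/kernel/slab/%s/%s", cache, attr);
		f = fopen(path, "r");
		if (!f)
			return 0;	/* a missing attribute reads as zero */
		if (fgets(buf, sizeof(buf), f))
			val = atol(buf);
		fclose(f);
		return val;
	}

	int main(void)
	{
		printf("dentry objects: %ld\n", read_slab_attr("dentry", "objects"));
		return 0;
	}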
index 4ec84018cc1364f8f905ee30c31e52f04d7ec6ff..28bc57ee757cf04d7b2166dc3e4b236b5fd19de6 100644 (file)
@@ -1,10 +1,15 @@
-TARGETS = breakpoints
+TARGETS = breakpoints vm
 
 all:
        for TARGET in $(TARGETS); do \
                make -C $$TARGET; \
        done;
 
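+# run_tests builds everything first, then recurses into each target's tests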
+run_tests: all
+       for TARGET in $(TARGETS); do \
+               make -C $$TARGET run_tests; \
+       done;
+
 clean:
        for TARGET in $(TARGETS); do \
                make -C $$TARGET clean; \
index f362722cdce7e564ad70d9d1c6df1ae094f0e2f7..931278035f5c94746fbe3ea5376fccdcf81e5747 100644 (file)
@@ -11,10 +11,13 @@ endif
 
 all:
 ifeq ($(ARCH),x86)
-       gcc breakpoint_test.c -o run_test
+       gcc breakpoint_test.c -o breakpoint_test
 else
        echo "Not an x86 target, can't build breakpoints selftests"
 endif
 
+run_tests:
+       ./breakpoint_test
+
 clean:
-       rm -fr run_test
+       rm -fr breakpoint_test
diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests
deleted file mode 100644 (file)
index 320718a..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-TARGETS=breakpoints
-
-for TARGET in $TARGETS
-do
-       $TARGET/run_test
-done
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
new file mode 100644 (file)
index 0000000..b336b24
--- /dev/null
@@ -0,0 +1,14 @@
+# Makefile for vm selftests
+
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall -Wextra
+
+all: hugepage-mmap hugepage-shm map_hugetlb
+%: %.c
+       $(CC) $(CFLAGS) -o $@ $^
+
+run_tests: all
+       /bin/sh ./run_vmtests
+
+clean:
+       $(RM) hugepage-mmap hugepage-shm map_hugetlb
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
new file mode 100644 (file)
index 0000000..a10f310
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call.  Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
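+/* Path under the hugetlbfs mount; run_vmtests mounts hugetlbfs on ./huge. */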
+#define FILE_NAME "huge/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+       unsigned long i;
+
+       for (i = 0; i < LENGTH; i++)
+               *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < LENGTH; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+int main(void)
+{
+       void *addr;
+       int fd, ret;
+
+       fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+       if (fd < 0) {
+               perror("Open failed");
+               exit(1);
+       }
+
+       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               unlink(FILE_NAME);
+               exit(1);
+       }
+
+       printf("Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr);
+       ret = read_bytes(addr);
+
+       munmap(addr, LENGTH);
+       close(fd);
+       unlink(FILE_NAME);
+
+       return ret;
+}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
new file mode 100644 (file)
index 0000000..0d0ef4f
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls.  In this example the app is requesting 256MB of
+ * memory that is backed by huge pages.  The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x)  printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+       int shmid;
+       unsigned long i;
+       char *shmaddr;
+
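+       /* key 2 is an arbitrary fixed SysV IPC key; SHM_HUGETLB requests huge page backing */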
+       shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+       if (shmid < 0) {
+               perror("shmget");
+               exit(1);
+       }
+       printf("shmid: 0x%x\n", shmid);
+
+       shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+       if (shmaddr == (char *)-1) {
+               perror("Shared memory attach failure");
+               shmctl(shmid, IPC_RMID, NULL);
+               exit(2);
+       }
+       printf("shmaddr: %p\n", shmaddr);
+
+       dprintf("Starting the writes:\n");
+       for (i = 0; i < LENGTH; i++) {
+               shmaddr[i] = (char)(i);
+               if (!(i % (1024 * 1024)))
+                       dprintf(".");
+       }
+       dprintf("\n");
+
+       dprintf("Starting the Check...");
+       for (i = 0; i < LENGTH; i++)
+               if (shmaddr[i] != (char)i) {
+                       printf("\nIndex %lu mismatched\n", i);
+                       exit(3);
+               }
+       dprintf("Done.\n");
+
+       if (shmdt((const void *)shmaddr) != 0) {
+               perror("Detach failure");
+               shmctl(shmid, IPC_RMID, NULL);
+               exit(4);
+       }
+
+       shmctl(shmid, IPC_RMID, NULL);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
new file mode 100644 (file)
index 0000000..ac56639
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag.  Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means a fixed, huge page aligned address starting with
+ * 0x800000... needs to be specified.  Specifying a fixed address is not
+ * required on ppc64, i386 or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+       unsigned long i;
+
+       for (i = 0; i < LENGTH; i++)
+               *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < LENGTH; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+int main(void)
+{
+       void *addr;
+       int ret;
+
+       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       printf("Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr);
+       ret = read_bytes(addr);
+
+       munmap(addr, LENGTH);
+
+       return ret;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
new file mode 100644 (file)
index 0000000..8b40bd5
--- /dev/null
@@ -0,0 +1,77 @@
+#!/bin/bash
+#please run as root
+
+#we need 256 MB; the size below is in kB
+needmem=262144
+mnt=./huge
+
+#get pagesize and freepages from /proc/meminfo
+while read name size unit; do
+       if [ "$name" = "HugePages_Free:" ]; then
+               freepgs=$size
+       fi
+       if [ "$name" = "Hugepagesize:" ]; then
+               pgsize=$size
+       fi
+done < /proc/meminfo
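+#the loop above parses lines such as:
+#  HugePages_Free:      512
+#  Hugepagesize:       2048 kB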
+
+#set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$pgsize" ]; then
+       nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
+       needpgs=`expr $needmem / $pgsize`
+       if [ $freepgs -lt $needpgs ]; then
+               lackpgs=$(( $needpgs - $freepgs ))
+               echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+               if [ $? -ne 0 ]; then
+                       echo "Please run this test as root"
+                       exit 1
+               fi
+       fi
+else
+       echo "no hugetlbfs support in kernel?"
+       exit 1
+fi
+
+mkdir -p $mnt
+mount -t hugetlbfs none $mnt
+
+echo "--------------------"
+echo "runing hugepage-mmap"
+echo "--------------------"
+./hugepage-mmap
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+else
+       echo "[PASS]"
+fi
+
+shmmax=`cat /proc/sys/kernel/shmmax`
+shmall=`cat /proc/sys/kernel/shmall`
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
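+#note: shmmax is in bytes (256 MB here), shmall is in pages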
+echo "--------------------"
+echo "runing hugepage-shm"
+echo "--------------------"
+./hugepage-shm
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+else
+       echo "[PASS]"
+fi
+echo $shmmax > /proc/sys/kernel/shmmax
+echo $shmall > /proc/sys/kernel/shmall
+
+echo "--------------------"
+echo "runing map_hugetlb"
+echo "--------------------"
+./map_hugetlb
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+else
+       echo "[PASS]"
+fi
+
+#cleanup
+umount $mnt
+rm -rf $mnt
+echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
diff --git a/tools/vm/Makefile b/tools/vm/Makefile
new file mode 100644 (file)
index 0000000..8e30e5c
--- /dev/null
@@ -0,0 +1,11 @@
+# Makefile for vm tools
+
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall -Wextra
+
+all: page-types slabinfo
+%: %.c
+       $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+       $(RM) page-types slabinfo
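+
+# Typical usage (a sketch; /proc/kpageflags is normally readable only
+# by root):
+#   make -C tools/vm
+#   sudo tools/vm/page-types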
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
new file mode 100644 (file)
index 0000000..7dab7b2
--- /dev/null
@@ -0,0 +1,1102 @@
+/*
+ * page-types: Tool for querying page flags
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should find a copy of v2 of the GNU General Public License somewhere on
+ * your Linux system; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2009 Intel corporation
+ *
+ * Authors: Wu Fengguang <fengguang.wu@intel.com>
+ */
+
+#define _LARGEFILE64_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <getopt.h>
+#include <limits.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/statfs.h>
+#include "../../include/linux/magic.h"
+
+
+#ifndef MAX_PATH
+# define MAX_PATH 256
+#endif
+
+#ifndef STR
+# define _STR(x) #x
+# define STR(x) _STR(x)
+#endif
+
+/*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES      sizeof(uint64_t)
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
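+
+/*
+ * Resulting layout of a 64-bit pagemap entry, as implied by the macros
+ * above:
+ *
+ *   bit  63     page present
+ *   bit  62     page swapped
+ *   bits 60-55  page shift
+ *   bits 54-0   page frame number (when present)
+ */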
+
+
+/*
+ * kernel page flags
+ */
+
+#define KPF_BYTES              8
+#define PROC_KPAGEFLAGS                "/proc/kpageflags"
+
+/* copied from kpageflags_read() */
+#define KPF_LOCKED             0
+#define KPF_ERROR              1
+#define KPF_REFERENCED         2
+#define KPF_UPTODATE           3
+#define KPF_DIRTY              4
+#define KPF_LRU                        5
+#define KPF_ACTIVE             6
+#define KPF_SLAB               7
+#define KPF_WRITEBACK          8
+#define KPF_RECLAIM            9
+#define KPF_BUDDY              10
+
+/* [11-20] new additions in 2.6.31 */
+#define KPF_MMAP               11
+#define KPF_ANON               12
+#define KPF_SWAPCACHE          13
+#define KPF_SWAPBACKED         14
+#define KPF_COMPOUND_HEAD      15
+#define KPF_COMPOUND_TAIL      16
+#define KPF_HUGE               17
+#define KPF_UNEVICTABLE                18
+#define KPF_HWPOISON           19
+#define KPF_NOPAGE             20
+#define KPF_KSM                        21
+#define KPF_THP                        22
+
+/* [32-] kernel hacking assistances */
+#define KPF_RESERVED           32
+#define KPF_MLOCKED            33
+#define KPF_MAPPEDTODISK       34
+#define KPF_PRIVATE            35
+#define KPF_PRIVATE_2          36
+#define KPF_OWNER_PRIVATE      37
+#define KPF_ARCH               38
+#define KPF_UNCACHED           39
+
+/* [48-] take some arbitrary free slots for expanding overloaded flags
+ * not part of kernel API
+ */
+#define KPF_READAHEAD          48
+#define KPF_SLOB_FREE          49
+#define KPF_SLUB_FROZEN                50
+#define KPF_SLUB_DEBUG         51
+
+#define KPF_ALL_BITS           ((uint64_t)~0ULL)
+#define KPF_HACKERS_BITS       (0xffffULL << 32)
+#define KPF_OVERLOADED_BITS    (0xffffULL << 48)
+#define BIT(name)              (1ULL << KPF_##name)
+#define BITS_COMPOUND          (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
+
+static const char * const page_flag_names[] = {
+       [KPF_LOCKED]            = "L:locked",
+       [KPF_ERROR]             = "E:error",
+       [KPF_REFERENCED]        = "R:referenced",
+       [KPF_UPTODATE]          = "U:uptodate",
+       [KPF_DIRTY]             = "D:dirty",
+       [KPF_LRU]               = "l:lru",
+       [KPF_ACTIVE]            = "A:active",
+       [KPF_SLAB]              = "S:slab",
+       [KPF_WRITEBACK]         = "W:writeback",
+       [KPF_RECLAIM]           = "I:reclaim",
+       [KPF_BUDDY]             = "B:buddy",
+
+       [KPF_MMAP]              = "M:mmap",
+       [KPF_ANON]              = "a:anonymous",
+       [KPF_SWAPCACHE]         = "s:swapcache",
+       [KPF_SWAPBACKED]        = "b:swapbacked",
+       [KPF_COMPOUND_HEAD]     = "H:compound_head",
+       [KPF_COMPOUND_TAIL]     = "T:compound_tail",
+       [KPF_HUGE]              = "G:huge",
+       [KPF_UNEVICTABLE]       = "u:unevictable",
+       [KPF_HWPOISON]          = "X:hwpoison",
+       [KPF_NOPAGE]            = "n:nopage",
+       [KPF_KSM]               = "x:ksm",
+       [KPF_THP]               = "t:thp",
+
+       [KPF_RESERVED]          = "r:reserved",
+       [KPF_MLOCKED]           = "m:mlocked",
+       [KPF_MAPPEDTODISK]      = "d:mappedtodisk",
+       [KPF_PRIVATE]           = "P:private",
+       [KPF_PRIVATE_2]         = "p:private_2",
+       [KPF_OWNER_PRIVATE]     = "O:owner_private",
+       [KPF_ARCH]              = "h:arch",
+       [KPF_UNCACHED]          = "c:uncached",
+
+       [KPF_READAHEAD]         = "I:readahead",
+       [KPF_SLOB_FREE]         = "P:slob_free",
+       [KPF_SLUB_FROZEN]       = "A:slub_frozen",
+       [KPF_SLUB_DEBUG]        = "E:slub_debug",
+};
+
+
+static const char * const debugfs_known_mountpoints[] = {
+       "/sys/kernel/debug",
+       "/debug",
+       0,
+};
+
+/*
+ * data structures
+ */
+
+static int             opt_raw;        /* for kernel developers */
+static int             opt_list;       /* list pages (in ranges) */
+static int             opt_no_summary; /* don't show summary */
+static pid_t           opt_pid;        /* process to walk */
+
+#define MAX_ADDR_RANGES        1024
+static int             nr_addr_ranges;
+static unsigned long   opt_offset[MAX_ADDR_RANGES];
+static unsigned long   opt_size[MAX_ADDR_RANGES];
+
+#define MAX_VMAS       10240
+static int             nr_vmas;
+static unsigned long   pg_start[MAX_VMAS];
+static unsigned long   pg_end[MAX_VMAS];
+
+#define MAX_BIT_FILTERS        64
+static int             nr_bit_filters;
+static uint64_t                opt_mask[MAX_BIT_FILTERS];
+static uint64_t                opt_bits[MAX_BIT_FILTERS];
+
+static int             page_size;
+
+static int             pagemap_fd;
+static int             kpageflags_fd;
+
+static int             opt_hwpoison;
+static int             opt_unpoison;
+
+static char            hwpoison_debug_fs[MAX_PATH+1];
+static int             hwpoison_inject_fd;
+static int             hwpoison_forget_fd;
+
+#define HASH_SHIFT     13
+#define HASH_SIZE      (1 << HASH_SHIFT)
+#define HASH_MASK      (HASH_SIZE - 1)
+#define HASH_KEY(flags)        (flags & HASH_MASK)
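+/* open-addressed hash of flag combinations, keyed on the low bits */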
+
+static unsigned long   total_pages;
+static unsigned long   nr_pages[HASH_SIZE];
+static uint64_t                page_flags[HASH_SIZE];
+
+
+/*
+ * helper functions
+ */
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define min_t(type, x, y) ({                   \
+       type __min1 = (x);                      \
+       type __min2 = (y);                      \
+       __min1 < __min2 ? __min1 : __min2; })
+
+#define max_t(type, x, y) ({                   \
+       type __max1 = (x);                      \
+       type __max2 = (y);                      \
+       __max1 > __max2 ? __max1 : __max2; })
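+
+/*
+ * min_t()/max_t() above are userspace copies of the kernel helpers of
+ * the same name; like the originals they rely on GCC statement
+ * expressions.
+ */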
+
+static unsigned long pages2mb(unsigned long pages)
+{
+       return (pages * page_size) >> 20;
+}
+
+static void fatal(const char *x, ...)
+{
+       va_list ap;
+
+       va_start(ap, x);
+       vfprintf(stderr, x, ap);
+       va_end(ap);
+       exit(EXIT_FAILURE);
+}
+
+static int checked_open(const char *pathname, int flags)
+{
+       int fd = open(pathname, flags);
+
+       if (fd < 0) {
+               perror(pathname);
+               exit(EXIT_FAILURE);
+       }
+
+       return fd;
+}
+
+/*
+ * pagemap/kpageflags routines
+ */
+
+static unsigned long do_u64_read(int fd, char *name,
+                                uint64_t *buf,
+                                unsigned long index,
+                                unsigned long count)
+{
+       long bytes;
+
+       if (index > ULONG_MAX / 8)
+               fatal("index overflow: %lu\n", index);
+
+       if (lseek(fd, index * 8, SEEK_SET) < 0) {
+               perror(name);
+               exit(EXIT_FAILURE);
+       }
+
+       bytes = read(fd, buf, count * 8);
+       if (bytes < 0) {
+               perror(name);
+               exit(EXIT_FAILURE);
+       }
+       if (bytes % 8)
+               fatal("partial read: %ld bytes\n", bytes);
+
+       return bytes / 8;
+}
+
+static unsigned long kpageflags_read(uint64_t *buf,
+                                    unsigned long index,
+                                    unsigned long pages)
+{
+       return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
+}
+
+static unsigned long pagemap_read(uint64_t *buf,
+                                 unsigned long index,
+                                 unsigned long pages)
+{
+       return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
+}
+
+static unsigned long pagemap_pfn(uint64_t val)
+{
+       unsigned long pfn;
+
+       if (val & PM_PRESENT)
+               pfn = PM_PFRAME(val);
+       else
+               pfn = 0;
+
+       return pfn;
+}
+
+
+/*
+ * page flag names
+ */
+
+static char *page_flag_name(uint64_t flags)
+{
+       static char buf[65];
+       int present;
+       int i, j;
+
+       for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+               present = (flags >> i) & 1;
+               if (!page_flag_names[i]) {
+                       if (present)
+                               fatal("unknown flag bit %d\n", i);
+                       continue;
+               }
+               buf[j++] = present ? page_flag_names[i][0] : '_';
+       }
+
+       return buf;
+}
+
+static char *page_flag_longname(uint64_t flags)
+{
+       static char buf[1024];
+       int i, n;
+
+       for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
+               if (!page_flag_names[i])
+                       continue;
+               if ((flags >> i) & 1)
+                       n += snprintf(buf + n, sizeof(buf) - n, "%s,",
+                                       page_flag_names[i] + 2);
+       }
+       if (n)
+               n--;
+       buf[n] = '\0';
+
+       return buf;
+}
+
+
+/*
+ * page list and summary
+ */
+
+static void show_page_range(unsigned long voffset,
+                           unsigned long offset, uint64_t flags)
+{
+       static uint64_t      flags0;
+       static unsigned long voff;
+       static unsigned long index;
+       static unsigned long count;
+
+       if (flags == flags0 && offset == index + count &&
+           (!opt_pid || voffset == voff + count)) {
+               count++;
+               return;
+       }
+
+       if (count) {
+               if (opt_pid)
+                       printf("%lx\t", voff);
+               printf("%lx\t%lx\t%s\n",
+                               index, count, page_flag_name(flags0));
+       }
+
+       flags0 = flags;
+       index  = offset;
+       voff   = voffset;
+       count  = 1;
+}
+
+static void show_page(unsigned long voffset,
+                     unsigned long offset, uint64_t flags)
+{
+       if (opt_pid)
+               printf("%lx\t", voffset);
+       printf("%lx\t%s\n", offset, page_flag_name(flags));
+}
+
+static void show_summary(void)
+{
+       int i;
+
+       printf("             flags\tpage-count       MB"
+               "  symbolic-flags\t\t\tlong-symbolic-flags\n");
+
+       for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
+               if (nr_pages[i])
+                       printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
+                               (unsigned long long)page_flags[i],
+                               nr_pages[i],
+                               pages2mb(nr_pages[i]),
+                               page_flag_name(page_flags[i]),
+                               page_flag_longname(page_flags[i]));
+       }
+
+       printf("             total\t%10lu %8lu\n",
+                       total_pages, pages2mb(total_pages));
+}
+
+
+/*
+ * page flag filters
+ */
+
+static int bit_mask_ok(uint64_t flags)
+{
+       int i;
+
+       for (i = 0; i < nr_bit_filters; i++) {
+               if (opt_bits[i] == KPF_ALL_BITS) {
+                       if ((flags & opt_mask[i]) == 0)
+                               return 0;
+               } else {
+                       if ((flags & opt_mask[i]) != opt_bits[i])
+                               return 0;
+               }
+       }
+
+       return 1;
+}
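+
+/*
+ * Filter semantics, by example: mask=BIT(ACTIVE) with bits=KPF_ALL_BITS
+ * keeps only pages with the active bit set, while mask=BIT(ACTIVE) with
+ * bits=0 keeps only pages with it clear.
+ */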
+
+static uint64_t expand_overloaded_flags(uint64_t flags)
+{
+       /* SLOB/SLUB overload several page flags */
+       if (flags & BIT(SLAB)) {
+               if (flags & BIT(PRIVATE))
+                       flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
+               if (flags & BIT(ACTIVE))
+                       flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
+               if (flags & BIT(ERROR))
+                       flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
+       }
+
+       /* PG_reclaim is overloaded as PG_readahead in the read path */
+       if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
+               flags ^= BIT(RECLAIM) | BIT(READAHEAD);
+
+       return flags;
+}
+
+static uint64_t well_known_flags(uint64_t flags)
+{
+       /* hide flags intended only for kernel hackers */
+       flags &= ~KPF_HACKERS_BITS;
+
+       /* hide non-hugeTLB compound pages */
+       if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
+               flags &= ~BITS_COMPOUND;
+
+       return flags;
+}
+
+static uint64_t kpageflags_flags(uint64_t flags)
+{
+       flags = expand_overloaded_flags(flags);
+
+       if (!opt_raw)
+               flags = well_known_flags(flags);
+
+       return flags;
+}
+
+/* verify that a mountpoint is actually a debugfs instance */
+static int debugfs_valid_mountpoint(const char *debugfs)
+{
+       struct statfs st_fs;
+
+       if (statfs(debugfs, &st_fs) < 0)
+               return -ENOENT;
+       else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
+               return -ENOENT;
+
+       return 0;
+}
+
+/* find the path to the mounted debugfs */
+static const char *debugfs_find_mountpoint(void)
+{
+       const char **ptr;
+       char type[100] = "";    /* in case no debugfs line is found */
+       FILE *fp;
+
+       ptr = debugfs_known_mountpoints;
+       while (*ptr) {
+               if (debugfs_valid_mountpoint(*ptr) == 0) {
+                       strcpy(hwpoison_debug_fs, *ptr);
+                       return hwpoison_debug_fs;
+               }
+               ptr++;
+       }
+
+       /* give up and parse /proc/mounts */
+       fp = fopen("/proc/mounts", "r");
+       if (fp == NULL)
+               perror("Can't open /proc/mounts for read");
+
+       while (fscanf(fp, "%*s %"
+                     STR(MAX_PATH)
+                     "s %99s %*s %*d %*d\n",
+                     hwpoison_debug_fs, type) == 2) {
+               if (strcmp(type, "debugfs") == 0)
+                       break;
+       }
+       fclose(fp);
+
+       if (strcmp(type, "debugfs") != 0)
+               return NULL;
+
+       return hwpoison_debug_fs;
+}
+
+/* mount debugfs somewhere if it's not already mounted */
+
+static void debugfs_mount(void)
+{
+       const char **ptr;
+
+       /* see if it's already mounted */
+       if (debugfs_find_mountpoint())
+               return;
+
+       ptr = debugfs_known_mountpoints;
+       while (*ptr) {
+               if (mount(NULL, *ptr, "debugfs", 0, NULL) == 0) {
+                       /* save the mountpoint */
+                       strcpy(hwpoison_debug_fs, *ptr);
+                       break;
+               }
+               ptr++;
+       }
+
+       if (*ptr == NULL) {
+               perror("mount debugfs");
+               exit(EXIT_FAILURE);
+       }
+}
+
+/*
+ * page actions
+ */
+
+static void prepare_hwpoison_fd(void)
+{
+       char buf[MAX_PATH + 1];
+
+       debugfs_mount();
+
+       if (opt_hwpoison && !hwpoison_inject_fd) {
+               snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
+                       hwpoison_debug_fs);
+               hwpoison_inject_fd = checked_open(buf, O_WRONLY);
+       }
+
+       if (opt_unpoison && !hwpoison_forget_fd) {
+               snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
+                       hwpoison_debug_fs);
+               hwpoison_forget_fd = checked_open(buf, O_WRONLY);
+       }
+}
+
+static int hwpoison_page(unsigned long offset)
+{
+       char buf[100];
+       int len;
+
+       len = sprintf(buf, "0x%lx\n", offset);
+       len = write(hwpoison_inject_fd, buf, len);
+       if (len < 0) {
+               perror("hwpoison inject");
+               return len;
+       }
+       return 0;
+}
+
+static int unpoison_page(unsigned long offset)
+{
+       char buf[100];
+       int len;
+
+       len = sprintf(buf, "0x%lx\n", offset);
+       len = write(hwpoison_forget_fd, buf, len);
+       if (len < 0) {
+               perror("hwpoison forget");
+               return len;
+       }
+       return 0;
+}
+
+/*
+ * page frame walker
+ */
+
+static int hash_slot(uint64_t flags)
+{
+       int k = HASH_KEY(flags);
+       int i;
+
+       /* Explicitly reserve slot 0 for flags 0: the following logic
+        * cannot distinguish an unoccupied slot from one holding
+        * flags == 0.
+        */
+       if (flags == 0)
+               return 0;
+
+       /* search through the remaining (HASH_SIZE-1) slots */
+       for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
+               if (!k || k >= ARRAY_SIZE(page_flags))
+                       k = 1;
+               if (page_flags[k] == 0) {
+                       page_flags[k] = flags;
+                       return k;
+               }
+               if (page_flags[k] == flags)
+                       return k;
+       }
+
+       fatal("hash table full: bump up HASH_SHIFT?\n");
+       exit(EXIT_FAILURE);
+}
+
+static void add_page(unsigned long voffset,
+                    unsigned long offset, uint64_t flags)
+{
+       flags = kpageflags_flags(flags);
+
+       if (!bit_mask_ok(flags))
+               return;
+
+       if (opt_hwpoison)
+               hwpoison_page(offset);
+       if (opt_unpoison)
+               unpoison_page(offset);
+
+       if (opt_list == 1)
+               show_page_range(voffset, offset, flags);
+       else if (opt_list == 2)
+               show_page(voffset, offset, flags);
+
+       nr_pages[hash_slot(flags)]++;
+       total_pages++;
+}
+
+#define KPAGEFLAGS_BATCH       (64 << 10)      /* 64k pages */
+static void walk_pfn(unsigned long voffset,
+                    unsigned long index,
+                    unsigned long count)
+{
+       uint64_t buf[KPAGEFLAGS_BATCH];
+       unsigned long batch;
+       long pages;
+       unsigned long i;
+
+       while (count) {
+               batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
+               pages = kpageflags_read(buf, index, batch);
+               if (pages == 0)
+                       break;
+
+               for (i = 0; i < pages; i++)
+                       add_page(voffset + i, index + i, buf[i]);
+
+               index += pages;
+               count -= pages;
+       }
+}
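+
+/*
+ * Note on stack usage: the 64k-entry batch buffers in walk_pfn() above
+ * and walk_vma() below are 512 kB each, and since walk_vma() calls
+ * walk_pfn(), both can be live at once.
+ */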
+
+#define PAGEMAP_BATCH  (64 << 10)
+static void walk_vma(unsigned long index, unsigned long count)
+{
+       uint64_t buf[PAGEMAP_BATCH];
+       unsigned long batch;
+       unsigned long pages;
+       unsigned long pfn;
+       unsigned long i;
+
+       while (count) {
+               batch = min_t(unsigned long, count, PAGEMAP_BATCH);
+               pages = pagemap_read(buf, index, batch);
+               if (pages == 0)
+                       break;
+
+               for (i = 0; i < pages; i++) {
+                       pfn = pagemap_pfn(buf[i]);
+                       if (pfn)
+                               walk_pfn(index + i, pfn, 1);
+               }
+
+               index += pages;
+               count -= pages;
+       }
+}
+
+static void walk_task(unsigned long index, unsigned long count)
+{
+       const unsigned long end = index + count;
+       unsigned long start;
+       int i = 0;
+
+       while (index < end) {
+
+               while (pg_end[i] <= index)
+                       if (++i >= nr_vmas)
+                               return;
+               if (pg_start[i] >= end)
+                       return;
+
+               start = max_t(unsigned long, pg_start[i], index);
+               index = min_t(unsigned long, pg_end[i], end);