Merge branch 'akpm' (Andrew's patch-bomb)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 22 Mar 2012 16:04:48 +0000 (09:04 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 22 Mar 2012 16:04:48 +0000 (09:04 -0700)
Merge first batch of patches from Andrew Morton:
 "A few misc things and all the MM queue"

* emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits)
  memcg: avoid THP split in task migration
  thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE
  memcg: clean up existing move charge code
  mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read()
  mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event()
  mm/memcontrol.c: s/stealed/stolen/
  memcg: fix performance of mem_cgroup_begin_update_page_stat()
  memcg: remove PCG_FILE_MAPPED
  memcg: use new logic for page stat accounting
  memcg: remove PCG_MOVE_LOCK flag from page_cgroup
  memcg: simplify move_account() check
  memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat)
  memcg: kill dead prev_priority stubs
  memcg: remove PCG_CACHE page_cgroup flag
  memcg: let css_get_next() rely upon rcu_read_lock()
  cgroup: revert ss_id_lock to spinlock
  idr: make idr_get_next() good for rcu_read_lock()
  memcg: remove unnecessary thp check in page stat accounting
  memcg: remove redundant returns
  memcg: enum lru_list lru
  ...

77 files changed:
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
Documentation/vm/page-types.c
Documentation/vm/pagemap.txt
arch/sparc/kernel/signal32.c
arch/sparc/kernel/signal_32.c
arch/sparc/kernel/signal_64.c
arch/x86/kernel/sys_x86_64.c
arch/x86/kernel/vm86_32.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/numa_emulation.c
arch/xtensa/kernel/signal.c
drivers/idle/intel_idle.c
drivers/tty/sysrq.c
fs/exec.c
fs/hugetlbfs/inode.c
fs/namei.c
fs/proc/base.c
fs/proc/internal.h
fs/proc/page.c
fs/proc/task_mmu.c
fs/proc/task_nommu.c
fs/seq_file.c
include/asm-generic/pgtable.h
include/linux/cgroup.h
include/linux/compaction.h
include/linux/cpuset.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/init_task.h
include/linux/kernel-page-flags.h
include/linux/memcontrol.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/oom.h
include/linux/page-flags.h
include/linux/page_cgroup.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
ipc/shm.c
kernel/cgroup.c
kernel/cpuset.c
kernel/exit.c
kernel/fork.c
lib/idr.c
mm/bootmem.c
mm/compaction.c
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/ksm.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/mincore.c
mm/mmap.c
mm/mmu_context.c
mm/mprotect.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/pagewalk.c
mm/pgtable-generic.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/util.c
mm/vmscan.c

index a76a26a1db8a6fe855b0eeea32dee251296c0c8a..b7413cb46dcb1f1a331068861e766a6c03a5da40 100644 (file)
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -290,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   rsslim        current limit in bytes on the rss
   start_code    address above which program text can run
   end_code      address below which program text can run
-  start_stack   address of the start of the stack
+  start_stack   address of the start of the main process stack
   esp           current value of ESP
   eip           current value of EIP
   pending       bitmap of pending signals
@@ -325,7 +325,7 @@ address           perms offset  dev   inode      pathname
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
 a7cb2000-a7eb2000 rw-p 00000000 00:00 0
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0          [stack:1001]
 a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
 a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
 a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
@@ -357,11 +357,39 @@ is not associated with a file:
 
  [heap]                   = the heap of the program
  [stack]                  = the stack of the main process
+ [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
 
  or if empty, the mapping is anonymous.
 
+The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
+of the individual tasks of a process. In this file you will see a mapping marked
+as [stack] if that task sees it as a stack. This is a key difference from the
+content of /proc/PID/maps, where you will see all mappings that are being used
+as stack by all of those tasks. Hence, for the example above, the task-level
+map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+
+08048000-08049000 r-xp 00000000 03:00 8312       /opt/test
+08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
+0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
+a7cb1000-a7cb2000 ---p 00000000 00:00 0
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0
+a7eb2000-a7eb3000 ---p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0          [stack]
+a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
+a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
+a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
+a800b000-a800e000 rw-p 00000000 00:00 0
+a800e000-a8022000 r-xp 00000000 03:00 14462      /lib/libpthread.so.0
+a8022000-a8023000 r--p 00013000 03:00 14462      /lib/libpthread.so.0
+a8023000-a8024000 rw-p 00014000 03:00 14462      /lib/libpthread.so.0
+a8024000-a8027000 rw-p 00000000 00:00 0
+a8027000-a8043000 r-xp 00000000 03:00 8317       /lib/ld-linux.so.2
+a8043000-a8044000 r--p 0001b000 03:00 8317       /lib/ld-linux.so.2
+a8044000-a8045000 rw-p 0001c000 03:00 8317       /lib/ld-linux.so.2
+aff35000-aff4a000 rw-p 00000000 00:00 0
+ffffe000-fffff000 r-xp 00000000 00:00 0          [vdso]
 
 The /proc/PID/smaps is an extension based on maps, showing the memory
 consumption for each of the process's mappings. For each of mappings there
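
The new per-task view documented above can be exercised from userspace with a
short C program. The sketch below is illustrative only (hypothetical helper
name, minimal error handling): it scans the process-wide /proc/<pid>/maps and
then each /proc/<pid>/task/<tid>/maps, printing the lines the kernel tags as
[stack] or [stack:TID].

/*
 * Illustrative sketch (not part of the patch): list every mapping that the
 * kernel annotates as a stack, first in the process-wide maps file and then
 * in the per-task view described above.
 */
#include <dirent.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void print_stacks(const char *maps_path)
{
        char line[512];
        FILE *f = fopen(maps_path, "r");

        if (!f)
                return;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "[stack"))     /* matches [stack] and [stack:TID] */
                        printf("%s: %s", maps_path, line);
        fclose(f);
}

int main(void)
{
        char path[128];
        struct dirent *de;
        DIR *dir;

        /* Process-wide view: additional thread stacks appear as [stack:TID]. */
        snprintf(path, sizeof(path), "/proc/%d/maps", (int)getpid());
        print_stacks(path);

        /* Per-task view: each task reports its own stack as plain [stack]. */
        snprintf(path, sizeof(path), "/proc/%d/task", (int)getpid());
        dir = opendir(path);
        if (!dir)
                return 1;
        while ((de = readdir(dir)) != NULL) {
                if (de->d_name[0] == '.')
                        continue;
                snprintf(path, sizeof(path), "/proc/%d/task/%s/maps",
                         (int)getpid(), de->d_name);
                print_stacks(path);
        }
        closedir(dir);
        return 0;
}
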
index 8cadb7551fca8d5a199edc7fa9008af07359d499..7986d79d9d17030a73fb6efbe1ae02ddece06342 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2635,6 +2635,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        to facilitate early boot debugging.
                        See also Documentation/trace/events.txt
 
+       transparent_hugepage=
+                       [KNL]
+                       Format: [always|madvise|never]
+                       Can be used to control the default behavior of the system
+                       with respect to transparent hugepages.
+                       See Documentation/vm/transhuge.txt for more details.
+
        tsc=            Disable clocksource stability checks for TSC.
                        Format: <string>
                        [x86] reliable: mark tsc clocksource as reliable, this
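
For reference, the boot parameter above only sets the default policy; on
kernels built with CONFIG_TRANSPARENT_HUGEPAGE the effective setting is also
exposed at runtime through sysfs. The minimal sketch below just reads that
knob; the sysfs path is assumed from the THP documentation referenced above,
not something introduced by this hunk.

/*
 * Minimal sketch: print the current transparent hugepage policy.
 * Assumes the conventional sysfs knob (see Documentation/vm/transhuge.txt);
 * the boot parameter documented above only chooses the default value.
 */
#include <stdio.h>

int main(void)
{
        char buf[128];
        FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");

        if (!f) {
                perror("transparent_hugepage/enabled");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("THP policy: %s", buf);  /* e.g. "always [madvise] never" */
        fclose(f);
        return 0;
}
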
index 7445caa26d05f59d9440d9556314be77ae090053..0b13f02d405971ab4d2c72d1d013c07a1dbc78a3 100644 (file)
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -98,6 +98,7 @@
 #define KPF_HWPOISON           19
 #define KPF_NOPAGE             20
 #define KPF_KSM                        21
+#define KPF_THP                        22
 
 /* [32-] kernel hacking assistances */
 #define KPF_RESERVED           32
@@ -147,6 +148,7 @@ static const char *page_flag_names[] = {
        [KPF_HWPOISON]          = "X:hwpoison",
        [KPF_NOPAGE]            = "n:nopage",
        [KPF_KSM]               = "x:ksm",
+       [KPF_THP]               = "t:thp",
 
        [KPF_RESERVED]          = "r:reserved",
        [KPF_MLOCKED]           = "m:mlocked",
index df09b9650a811045ee99708682b2172c0c8fee83..4600cbe3d6beabc7e9fb77c147951bcda70e6b46 100644 (file)
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -60,6 +60,7 @@ There are three components to pagemap:
     19. HWPOISON
     20. NOPAGE
     21. KSM
+    22. THP
 
 Short descriptions to the page flags:
 
@@ -97,6 +98,9 @@ Short descriptions to the page flags:
 21. KSM
     identical memory pages dynamically shared between one or more processes
 
+22. THP
+    contiguous pages which construct transparent hugepages
+
     [IO related page flags]
  1. ERROR     IO error occurred
  3. UPTODATE  page has up-to-date data
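
A minimal way to observe the new THP flag from userspace is sketched below,
assuming the usual pagemap entry layout described earlier in this file
(bit 63 = present, low 55 bits = PFN) and root access for /proc/kpageflags;
the constants are written out for illustration rather than taken from kernel
headers.

/*
 * Illustrative sketch: test whether a page belongs to a transparent huge
 * page by combining /proc/self/pagemap (virtual -> PFN) with
 * /proc/kpageflags (PFN -> flags).  Bit numbers follow the documentation
 * above; reading /proc/kpageflags typically requires root.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PM_PRESENT      (1ULL << 63)
#define PM_PFN_MASK     ((1ULL << 55) - 1)
#define KPF_THP         22              /* new flag added by this series */

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        char *buf = malloc(4 << 20);    /* any populated buffer will do */
        uint64_t pme, kflags;
        int pagemap, kpageflags;
        off_t off;

        if (!buf)
                return 1;
        buf[0] = 1;                     /* fault the first page in */

        pagemap = open("/proc/self/pagemap", O_RDONLY);
        kpageflags = open("/proc/kpageflags", O_RDONLY);
        if (pagemap < 0 || kpageflags < 0) {
                perror("open");
                return 1;
        }

        /* One 64-bit pagemap entry per virtual page. */
        off = (uintptr_t)buf / psize * sizeof(uint64_t);
        if (pread(pagemap, &pme, sizeof(pme), off) != sizeof(pme))
                return 1;
        if (!(pme & PM_PRESENT)) {
                printf("page not present\n");
                return 0;
        }

        /* One 64-bit kpageflags entry per physical page frame. */
        off = (pme & PM_PFN_MASK) * sizeof(uint64_t);
        if (pread(kpageflags, &kflags, sizeof(kflags), off) != sizeof(kflags))
                return 1;

        printf("KPF_THP %s\n", (kflags >> KPF_THP) & 1 ? "set" : "clear");
        return 0;
}
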
index 023b8860dc9704330391b1db6a8eaf2ea117d32e..c8f5b50db89ceda3be69c8d5cc17d1b16476d0f5 100644 (file)
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -776,7 +776,6 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
                                  siginfo_t *info,
                                  sigset_t *oldset, struct pt_regs *regs)
 {
-       sigset_t blocked;
        int err;
 
        if (ka->sa.sa_flags & SA_SIGINFO)
@@ -787,11 +786,7 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
        if (err)
                return err;
 
-       sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-       if (!(ka->sa.sa_flags & SA_NOMASK))
-               sigaddset(&blocked, signr);
-       set_current_blocked(&blocked);
-
+       block_sigmask(ka, signr);
        tracehook_signal_handler(signr, info, ka, regs, 0);
 
        return 0;
index d54c6e53aba00323fda9f01c95aae0f001352fb9..7bb71b6fbd2052185943ada79384228a598f8739 100644 (file)
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -465,7 +465,6 @@ static inline int
 handle_signal(unsigned long signr, struct k_sigaction *ka,
              siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
 {
-       sigset_t blocked;
        int err;
 
        if (ka->sa.sa_flags & SA_SIGINFO)
@@ -476,11 +475,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka,
        if (err)
                return err;
 
-       sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-       if (!(ka->sa.sa_flags & SA_NOMASK))
-               sigaddset(&blocked, signr);
-       set_current_blocked(&blocked);
-
+       block_sigmask(ka, signr);
        tracehook_signal_handler(signr, info, ka, regs, 0);
 
        return 0;
index f0836cd0e2f243ffb3c1df37ff02cc31488850b7..d8a67e60be8042ce6610c36283e42d9ff1d1bcbf 100644 (file)
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -479,18 +479,14 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka,
                                siginfo_t *info,
                                sigset_t *oldset, struct pt_regs *regs)
 {
-       sigset_t blocked;
        int err;
 
        err = setup_rt_frame(ka, regs, signr, oldset,
                             (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL);
        if (err)
                return err;
-       sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-       if (!(ka->sa.sa_flags & SA_NOMASK))
-               sigaddset(&blocked, signr);
-       set_current_blocked(&blocked);
 
+       block_sigmask(ka, signr);
        tracehook_signal_handler(signr, info, ka, regs, 0);
 
        return 0;
index 051489082d591928e3e212ebc83ece12c93f50e2..ef59642ff1bf9e719ae1674668dece0a22787d7c 100644 (file)
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
-       unsigned long addr = addr0;
+       unsigned long addr = addr0, start_addr;
 
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                mm->free_area_cache = mm->mmap_base;
        }
 
+try_again:
        /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache;
-
-       /* make sure it can fit in the remaining address space */
-       if (addr > len) {
-               unsigned long tmp_addr = align_addr(addr - len, filp,
-                                                   ALIGN_TOPDOWN);
-
-               vma = find_vma(mm, tmp_addr);
-               if (!vma || tmp_addr + len <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return mm->free_area_cache = tmp_addr;
-       }
-
-       if (mm->mmap_base < len)
-               goto bottomup;
+       start_addr = addr = mm->free_area_cache;
 
-       addr = mm->mmap_base-len;
+       if (addr < len)
+               goto fail;
 
+       addr -= len;
        do {
                addr = align_addr(addr, filp, ALIGN_TOPDOWN);
 
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                addr = vma->vm_start-len;
        } while (len < vma->vm_start);
 
+fail:
+       /*
+        * if hint left us with no space for the requested
+        * mapping then try again:
+        */
+       if (start_addr != mm->mmap_base) {
+               mm->free_area_cache = mm->mmap_base;
+               mm->cached_hole_size = 0;
+               goto try_again;
+       }
+
 bottomup:
        /*
         * A failed mmap() very likely causes application failure,
index b466cab5ba15d171cb4b1fd61a1590bd0cca9956..328cb37bb827915ccc3e87cc6518acd7ccf25686 100644 (file)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
        spinlock_t *ptl;
        int i;
 
+       down_write(&mm->mmap_sem);
        pgd = pgd_offset(mm, 0xA0000);
        if (pgd_none_or_clear_bad(pgd))
                goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
        }
        pte_unmap_unlock(pte, ptl);
 out:
+       up_write(&mm->mmap_sem);
        flush_tlb();
 }
 
index 8ecbb4bba4b3b44241b64b46079476ad90f7aad4..f6679a7fb8ca492482f18421d78f438cc9cfba2b 100644 (file)
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -308,10 +308,11 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 {
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma, *prev_vma;
-       unsigned long base = mm->mmap_base, addr = addr0;
+       struct vm_area_struct *vma;
+       unsigned long base = mm->mmap_base;
+       unsigned long addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
-       int first_time = 1;
+       unsigned long start_addr;
 
        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
@@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                mm->free_area_cache  = base;
        }
 try_again:
+       start_addr = mm->free_area_cache;
+
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;
@@ -337,22 +340,14 @@ try_again:
                if (!vma)
                        return addr;
 
-               /*
-                * new region fits between prev_vma->vm_end and
-                * vma->vm_start, use it:
-                */
-               prev_vma = vma->vm_prev;
-               if (addr + len <= vma->vm_start &&
-                           (!prev_vma || (addr >= prev_vma->vm_end))) {
+               if (addr + len <= vma->vm_start) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
-               } else {
+               } else if (mm->free_area_cache == vma->vm_end) {
                        /* pull free_area_cache down to the first hole */
-                       if (mm->free_area_cache == vma->vm_end) {
-                               mm->free_area_cache = vma->vm_start;
-                               mm->cached_hole_size = largest_hole;
-                       }
+                       mm->free_area_cache = vma->vm_start;
+                       mm->cached_hole_size = largest_hole;
                }
 
                /* remember the largest hole we saw so far */
@@ -368,10 +363,9 @@ fail:
         * if hint left us with no space for the requested
         * mapping then try again:
         */
-       if (first_time) {
+       if (start_addr != base) {
                mm->free_area_cache = base;
                largest_hole = 0;
-               first_time = 0;
                goto try_again;
        }
        /*
index 46db56845f180bcc890195e52052daa686680518..740b0a3554316679e4fd69e11fb20a308dab471f 100644 (file)
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
        eb->nid = nid;
 
        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
-               emu_nid_to_phys[nid] = pb->nid;
+               emu_nid_to_phys[nid] = nid;
 
        pb->start += size;
        if (pb->start >= pb->end) {
index f2220b5bdce64daff9d0b75306e713159b334502..b69b000349fcdb20888a6e080cf8045ae82fc6b7 100644 (file)
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -260,10 +260,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
                goto badframe;
 
        sigdelsetmask(&set, ~_BLOCKABLE);
-       spin_lock_irq(&current->sighand->siglock);
-       current->blocked = set;
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
+       set_current_blocked(&set);
 
        if (restore_sigcontext(regs, frame))
                goto badframe;
@@ -336,8 +333,8 @@ gen_return_code(unsigned char *codemem)
 }
 
 
-static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                       sigset_t *set, struct pt_regs *regs)
+static int setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+                      sigset_t *set, struct pt_regs *regs)
 {
        struct rt_sigframe *frame;
        int err = 0;
@@ -422,12 +419,11 @@ static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                current->comm, current->pid, signal, frame, regs->pc);
 #endif
 
-       return;
+       return 0;
 
 give_sigsegv:
-       if (sig == SIGSEGV)
-               ka->sa.sa_handler = SIG_DFL;
-       force_sig(SIGSEGV, current);
+       force_sigsegv(sig, current);
+       return -EFAULT;
 }
 
 /*
@@ -449,11 +445,8 @@ asmlinkage long xtensa_rt_sigsuspend(sigset_t __user *unewset,
                return -EFAULT;
 
        sigdelsetmask(&newset, ~_BLOCKABLE);
-       spin_lock_irq(&current->sighand->siglock);
        saveset = current->blocked;
-       current->blocked = newset;
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
+       set_current_blocked(&newset);
 
        regs->areg[2] = -EINTR;
        while (1) {
@@ -536,17 +529,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 
                /* Whee!  Actually deliver the signal.  */
                /* Set up the stack frame */
-               setup_frame(signr, &ka, &info, oldset, regs);
-
-               if (ka.sa.sa_flags & SA_ONESHOT)
-                       ka.sa.sa_handler = SIG_DFL;
+               ret = setup_frame(signr, &ka, &info, oldset, regs);
+               if (ret)
+                       return ret;
 
-               spin_lock_irq(&current->sighand->siglock);
-               sigorsets(&current->blocked, &current->blocked, &ka.sa.sa_mask);
-               if (!(ka.sa.sa_flags & SA_NODEFER))
-                       sigaddset(&current->blocked, signr);
-               recalc_sigpending();
-               spin_unlock_irq(&current->sighand->siglock);
+               block_sigmask(&ka, signr);
                if (current->ptrace & PT_SINGLESTEP)
                        task_pt_regs(current)->icountlevel = 1;
 
index 1c15e9b3357567f7845452bf38b6796fb11b1a50..d0f59c3f87eff08c9af04bd7d13f2b91e6f3ea17 100644 (file)
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -507,8 +507,7 @@ int intel_idle_cpu_init(int cpu)
                int num_substates;
 
                if (cstate > max_cstate) {
-                       printk(PREFIX "max_cstate %d reached\n",
-                              max_cstate);
+                       printk(PREFIX "max_cstate %d reached\n", max_cstate);
                        break;
                }
 
@@ -524,8 +523,9 @@ int intel_idle_cpu_init(int cpu)
                dev->states_usage[dev->state_count].driver_data =
                        (void *)get_driver_data(cstate);
 
-                       dev->state_count += 1;
-               }
+               dev->state_count += 1;
+       }
+
        dev->cpu = cpu;
 
        if (cpuidle_register_device(dev)) {
index ecb8e2203ac87ac18ac0d4d24f17768bad57a088..136e86faa1e17e0e807a3c41b93fd464fd7c8729 100644 (file)
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -346,7 +346,7 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-       out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
+       out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL, true);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
index 0b931471d4f439d444987cb2ef81c203c8ff4f8f..23559c227d9cb4f9480855590df611999e852dd4 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -822,7 +822,7 @@ static int exec_mmap(struct mm_struct *mm)
        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
-       sync_mm_rss(tsk, old_mm);
+       sync_mm_rss(old_mm);
        mm_release(tsk, old_mm);
 
        if (old_mm) {
index 81932fa1861ae1a70e51dc105af411093afbe72e..ea251749d9d5982e6fc35ecf6d21ab61547962e8 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations;
 static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
+struct hugetlbfs_config {
+       uid_t   uid;
+       gid_t   gid;
+       umode_t mode;
+       long    nr_blocks;
+       long    nr_inodes;
+       struct hstate *hstate;
+};
+
+struct hugetlbfs_inode_info {
+       struct shared_policy policy;
+       struct inode vfs_inode;
+};
+
+static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+{
+       return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+}
+
 static struct backing_dev_info hugetlbfs_backing_dev_info = {
        .name           = "hugetlbfs",
        .ra_pages       = 0,    /* No readahead */
@@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                        return addr;
        }
 
-       start_addr = mm->free_area_cache;
-
-       if (len <= mm->cached_hole_size)
+       if (len > mm->cached_hole_size)
+               start_addr = mm->free_area_cache;
+       else {
                start_addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
+       }
 
 full_search:
        addr = ALIGN(start_addr, huge_page_size(h));
@@ -171,13 +192,18 @@ full_search:
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                start_addr = TASK_UNMAPPED_BASE;
+                               mm->cached_hole_size = 0;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
 
-               if (!vma || addr + len <= vma->vm_start)
+               if (!vma || addr + len <= vma->vm_start) {
+                       mm->free_area_cache = addr + len;
                        return addr;
+               }
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
                addr = ALIGN(vma->vm_end, huge_page_size(h));
        }
 }
@@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
        loff_t isize;
        ssize_t retval = 0;
 
-       mutex_lock(&inode->i_mutex);
-
        /* validate length */
        if (len == 0)
                goto out;
 
-       isize = i_size_read(inode);
-       if (!isize)
-               goto out;
-
-       end_index = (isize - 1) >> huge_page_shift(h);
        for (;;) {
                struct page *page;
                unsigned long nr, ret;
@@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 
                /* nr is the maximum number of bytes to copy from this page */
                nr = huge_page_size(h);
+               isize = i_size_read(inode);
+               if (!isize)
+                       goto out;
+               end_index = (isize - 1) >> huge_page_shift(h);
                if (index >= end_index) {
                        if (index > end_index)
                                goto out;
                        nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
-                       if (nr <= offset) {
+                       if (nr <= offset)
                                goto out;
-                       }
                }
                nr = nr - offset;
 
                /* Find the page */
-               page = find_get_page(mapping, index);
+               page = find_lock_page(mapping, index);
                if (unlikely(page == NULL)) {
                        /*
                         * We have a HOLE, zero out the user-buffer for the
@@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
                        else
                                ra = 0;
                } else {
+                       unlock_page(page);
+
                        /*
                         * We have the page, copy it to user space buffer.
                         */
                        ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
                        ret = ra;
+                       page_cache_release(page);
                }
                if (ra < 0) {
                        if (retval == 0)
                                retval = ra;
-                       if (page)
-                               page_cache_release(page);
                        goto out;
                }
 
@@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
                index += offset >> huge_page_shift(h);
                offset &= ~huge_page_mask(h);
 
-               if (page)
-                       page_cache_release(page);
-
                /* short read or no more work */
                if ((ret != nr) || (len == 0))
                        break;
        }
 out:
        *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
-       mutex_unlock(&inode->i_mutex);
        return retval;
 }
 
@@ -607,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
                spin_lock(&sbinfo->stat_lock);
                /* If no limits set, just report 0 for max/free/used
                 * blocks, like simple_statfs() */
-               if (sbinfo->max_blocks >= 0) {
-                       buf->f_blocks = sbinfo->max_blocks;
-                       buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+               if (sbinfo->spool) {
+                       long free_pages;
+
+                       spin_lock(&sbinfo->spool->lock);
+                       buf->f_blocks = sbinfo->spool->max_hpages;
+                       free_pages = sbinfo->spool->max_hpages
+                               - sbinfo->spool->used_hpages;
+                       buf->f_bavail = buf->f_bfree = free_pages;
+                       spin_unlock(&sbinfo->spool->lock);
                        buf->f_files = sbinfo->max_inodes;
                        buf->f_ffree = sbinfo->free_inodes;
                }
@@ -625,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
 
        if (sbi) {
                sb->s_fs_info = NULL;
+
+               if (sbi->spool)
+                       hugepage_put_subpool(sbi->spool);
+
                kfree(sbi);
        }
 }
@@ -853,10 +882,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_fs_info = sbinfo;
        sbinfo->hstate = config.hstate;
        spin_lock_init(&sbinfo->stat_lock);
-       sbinfo->max_blocks = config.nr_blocks;
-       sbinfo->free_blocks = config.nr_blocks;
        sbinfo->max_inodes = config.nr_inodes;
        sbinfo->free_inodes = config.nr_inodes;
+       sbinfo->spool = NULL;
+       if (config.nr_blocks != -1) {
+               sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+               if (!sbinfo->spool)
+                       goto out_free;
+       }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = huge_page_size(config.hstate);
        sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -868,38 +901,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
                goto out_free;
        return 0;
 out_free:
+       if (sbinfo->spool)
+               kfree(sbinfo->spool);
        kfree(sbinfo);
        return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
-       int ret = 0;
-       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-       if (sbinfo->free_blocks > -1) {
-               spin_lock(&sbinfo->stat_lock);
-               if (sbinfo->free_blocks - delta >= 0)
-                       sbinfo->free_blocks -= delta;
-               else
-                       ret = -ENOMEM;
-               spin_unlock(&sbinfo->stat_lock);
-       }
-
-       return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
-       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-       if (sbinfo->free_blocks > -1) {
-               spin_lock(&sbinfo->stat_lock);
-               sbinfo->free_blocks += delta;
-               spin_unlock(&sbinfo->stat_lock);
-       }
-}
-
 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
@@ -919,8 +926,8 @@ static int can_do_hugetlb_shm(void)
        return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size,
-                               vm_flags_t acctflag,
+struct file *hugetlb_file_setup(const char *name, unsigned long addr,
+                               size_t size, vm_flags_t acctflag,
                                struct user_struct **user, int creat_flags)
 {
        int error = -ENOMEM;
@@ -929,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
        struct path path;
        struct dentry *root;
        struct qstr quick_string;
+       struct hstate *hstate;
+       unsigned long num_pages;
 
        *user = NULL;
        if (!hugetlbfs_vfsmount)
@@ -937,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
-                       printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
+                       task_lock(current);
+                       printk_once(KERN_WARNING
+                               "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+                               current->comm, current->pid);
+                       task_unlock(current);
                } else {
                        *user = NULL;
                        return ERR_PTR(-EPERM);
@@ -958,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
        if (!inode)
                goto out_dentry;
 
+       hstate = hstate_inode(inode);
+       size += addr & ~huge_page_mask(hstate);
+       num_pages = ALIGN(size, huge_page_size(hstate)) >>
+                       huge_page_shift(hstate);
        error = -ENOMEM;
-       if (hugetlb_reserve_pages(inode, 0,
-                       size >> huge_page_shift(hstate_inode(inode)), NULL,
-                       acctflag))
+       if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
                goto out_inode;
 
        d_instantiate(path.dentry, inode);
@@ -997,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void)
        if (error)
                return error;
 
+       error = -ENOMEM;
        hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                                        sizeof(struct hugetlbfs_inode_info),
                                        0, 0, init_once);
@@ -1015,10 +1031,10 @@ static int __init init_hugetlbfs_fs(void)
        }
 
        error = PTR_ERR(vfsmount);
+       unregister_filesystem(&hugetlbfs_fs_type);
 
  out:
-       if (error)
-               kmem_cache_destroy(hugetlbfs_inode_cachep);
+       kmem_cache_destroy(hugetlbfs_inode_cachep);
  out2:
        bdi_destroy(&hugetlbfs_backing_dev_info);
        return error;
index 13e6a1f191a900aecb6fdaef499a955e17c92a9d..a94a7f9a03eaf9d543fb32620c9a172fdb0c60ec 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1455,9 +1455,15 @@ done:
 }
 EXPORT_SYMBOL(full_name_hash);
 
+#ifdef CONFIG_64BIT
 #define ONEBYTES       0x0101010101010101ul
 #define SLASHBYTES     0x2f2f2f2f2f2f2f2ful
 #define HIGHBITS       0x8080808080808080ul
+#else
+#define ONEBYTES       0x01010101ul
+#define SLASHBYTES     0x2f2f2f2ful
+#define HIGHBITS       0x80808080ul
+#endif
 
 /* Return the high bit set in the first byte that is a zero */
 static inline unsigned long has_zero(unsigned long a)
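
Only the prototype of has_zero() is visible in this hunk; the constants above
feed the classic zero-byte-in-a-word test used for word-at-a-time name
hashing. A standalone sketch of that textbook technique, assuming the kernel
version follows the same formulation, is:

/*
 * Standalone sketch of the classic "find a zero byte in a word" trick,
 * mirroring the 32/64-bit ONEBYTES/HIGHBITS constants above.  Shown for
 * illustration only; this is the textbook formulation of the technique.
 */
#include <limits.h>
#include <stdio.h>
#include <string.h>

#if ULONG_MAX > 0xfffffffful
#define ONEBYTES        0x0101010101010101ul
#define HIGHBITS        0x8080808080808080ul
#else
#define ONEBYTES        0x01010101ul
#define HIGHBITS        0x80808080ul
#endif

/* Returns a word with the high bit set in each byte lane of 'a' that is zero. */
static inline unsigned long has_zero(unsigned long a)
{
        return (a - ONEBYTES) & ~a & HIGHBITS;
}

int main(void)
{
        unsigned long w;
        char s[sizeof(w)] = "pro";      /* unused byte lanes stay zero */

        memcpy(&w, s, sizeof(w));
        /* Prints a mask with the high bit set in every zero byte lane of w. */
        printf("%#lx\n", has_zero(w));
        return 0;
}
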
index 965d4bde3a3b41816e1f08627ccd500bc6cc1544..3b42c1418f3118871d93275848f633217419129f 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2989,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = {
        INF("cmdline",    S_IRUGO, proc_pid_cmdline),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
-       REG("maps",       S_IRUGO, proc_maps_operations),
+       REG("maps",       S_IRUGO, proc_pid_maps_operations),
 #ifdef CONFIG_NUMA
-       REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
+       REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
 #endif
        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",        proc_cwd_link),
@@ -3002,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {
        REG("mountstats", S_IRUSR, proc_mountstats_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-       REG("smaps",      S_IRUGO, proc_smaps_operations),
+       REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
        REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
@@ -3348,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = {
        INF("cmdline",   S_IRUGO, proc_pid_cmdline),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
-       REG("maps",      S_IRUGO, proc_maps_operations),
+       REG("maps",      S_IRUGO, proc_tid_maps_operations),
 #ifdef CONFIG_NUMA
-       REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+       REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
 #endif
        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",       proc_cwd_link),
@@ -3360,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = {
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-       REG("smaps",     S_IRUGO, proc_smaps_operations),
+       REG("smaps",     S_IRUGO, proc_tid_smaps_operations),
        REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
index 292577531ad13e665bfd94ff63dbfd5a206a5472..c44efe19798fde8f11741448bbbfad548a0c151f 100644 (file)
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -53,9 +53,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task);
 extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
 
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
index 6d8e6a9e93aba59e471d32d4ee9dc59b2579d8c4..7fcd0d60a9680ae1eb4998633c51ea90e730209f 100644 (file)
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)
                u |= 1 << KPF_COMPOUND_TAIL;
        if (PageHuge(page))
                u |= 1 << KPF_HUGE;
+       else if (PageTransCompound(page))
+               u |= 1 << KPF_THP;
 
        /*
         * Caveats on high order pages: page->_count will only be set
index 7dcd2a250495d9a1777e6d7992637fb0a4897a81..9694cc2835115f18c5c6f9719d8ab9a332631901 100644 (file)
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,
        return ret;
 }
 
-static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
+       struct proc_maps_private *priv = m->private;
+       struct task_struct *task = priv->task;
        vm_flags_t flags = vma->vm_flags;
        unsigned long ino = 0;
        unsigned long long pgoff = 0;
        unsigned long start, end;
        dev_t dev = 0;
        int len;
+       const char *name = NULL;
 
        if (file) {
                struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
        if (file) {
                pad_len_spaces(m, len);
                seq_path(m, &file->f_path, "\n");
-       } else {
-               const char *name = arch_vma_name(vma);
-               if (!name) {
-                       if (mm) {
-                               if (vma->vm_start <= mm->brk &&
-                                               vma->vm_end >= mm->start_brk) {
-                                       name = "[heap]";
-                               } else if (vma->vm_start <= mm->start_stack &&
-                                          vma->vm_end >= mm->start_stack) {
-                                       name = "[stack]";
-                               }
+               goto done;
+       }
+
+       name = arch_vma_name(vma);
+       if (!name) {
+               pid_t tid;
+
+               if (!mm) {
+                       name = "[vdso]";
+                       goto done;
+               }
+
+               if (vma->vm_start <= mm->brk &&
+                   vma->vm_end >= mm->start_brk) {
+                       name = "[heap]";
+                       goto done;
+               }
+
+               tid = vm_is_stack(task, vma, is_pid);
+
+               if (tid != 0) {
+                       /*
+                        * Thread stack in /proc/PID/task/TID/maps or
+                        * the main process stack.
+                        */
+                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
+                           vma->vm_end >= mm->start_stack)) {
+                               name = "[stack]";
                        } else {
-                               name = "[vdso]";
+                               /* Thread stack in /proc/PID/maps */
+                               pad_len_spaces(m, len);
+                               seq_printf(m, "[stack:%d]", tid);
                        }
                }
-               if (name) {
-                       pad_len_spaces(m, len);
-                       seq_puts(m, name);
-               }
+       }
+
+done:
+       if (name) {
+               pad_len_spaces(m, len);
+               seq_puts(m, name);
        }
        seq_putc(m, '\n');
 }
 
-static int show_map(struct seq_file *m, void *v)
+static int show_map(struct seq_file *m, void *v, int is_pid)
 {
        struct vm_area_struct *vma = v;
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
 
-       show_map_vma(m, vma);
+       show_map_vma(m, vma, is_pid);
 
        if (m->count < m->size)  /* vma is copied successfully */
                m->version = (vma != get_gate_vma(task->mm))
@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)
        return 0;
 }
 
+static int show_pid_map(struct seq_file *m, void *v)
+{
+       return show_map(m, v, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *v)
+{
+       return show_map(m, v, 0);
+}
+
 static const struct seq_operations proc_pid_maps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
-       .show   = show_map
+       .show   = show_pid_map
 };
 
-static int maps_open(struct inode *inode, struct file *file)
+static const struct seq_operations proc_tid_maps_op = {
+       .start  = m_start,
+       .next   = m_next,
+       .stop   = m_stop,
+       .show   = show_tid_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
 {
        return do_maps_open(inode, file, &proc_pid_maps_op);
 }
 
-const struct file_operations proc_maps_operations = {
-       .open           = maps_open,
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+       return do_maps_open(inode, file, &proc_tid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+       .open           = pid_maps_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+       .open           = tid_maps_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
@@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
 
-       spin_lock(&walk->mm->page_table_lock);
-       if (pmd_trans_huge(*pmd)) {
-               if (pmd_trans_splitting(*pmd)) {
-                       spin_unlock(&walk->mm->page_table_lock);
-                       wait_split_huge_page(vma->anon_vma, pmd);
-               } else {
-                       smaps_pte_entry(*(pte_t *)pmd, addr,
-                                       HPAGE_PMD_SIZE, walk);
-                       spin_unlock(&walk->mm->page_table_lock);
-                       mss->anonymous_thp += HPAGE_PMD_SIZE;
-                       return 0;
-               }
-       } else {
+       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+               smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
                spin_unlock(&walk->mm->page_table_lock);
+               mss->anonymous_thp += HPAGE_PMD_SIZE;
+               return 0;
        }
+
+       if (pmd_trans_unstable(pmd))
+               return 0;
        /*
         * The mmap_sem held all the way back in m_start() is what
         * keeps khugepaged out of here and from collapsing things
@@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        return 0;
 }
 
-static int show_smap(struct seq_file *m, void *v)
+static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
@@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v)
        if (vma->vm_mm && !is_vm_hugetlb_page(vma))
                walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
-       show_map_vma(m, vma);
+       show_map_vma(m, vma, is_pid);
 
        seq_printf(m,
                   "Size:           %8lu kB\n"
@@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v)
        return 0;
 }
 
+static int show_pid_smap(struct seq_file *m, void *v)
+{
+       return show_smap(m, v, 1);
+}
+
+static int show_tid_smap(struct seq_file *m, void *v)
+{
+       return show_smap(m, v, 0);
+}
+
 static const struct seq_operations proc_pid_smaps_op = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
-       .show   = show_smap
+       .show   = show_pid_smap
+};
+
+static const struct seq_operations proc_tid_smaps_op = {
+       .start  = m_start,
+       .next   = m_next,
+       .stop   = m_stop,
+       .show   = show_tid_smap
 };
 
-static int smaps_open(struct inode *inode, struct file *file)
+static int pid_smaps_open(struct inode *inode, struct file *file)
 {
        return do_maps_open(inode, file, &proc_pid_smaps_op);
 }
 
-const struct file_operations proc_smaps_operations = {
-       .open           = smaps_open,
+static int tid_smaps_open(struct inode *inode, struct file *file)
+{
+       return do_maps_open(inode, file, &proc_tid_smaps_op);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+       .open           = pid_smaps_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+const struct file_operations proc_tid_smaps_operations = {
+       .open           = tid_smaps_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
@@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        struct page *page;
 
        split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -598,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = {
        .llseek         = noop_llseek,
 };
 
+typedef struct {
+       u64 pme;
+} pagemap_entry_t;
+
 struct pagemapread {
        int pos, len;
-       u64 *buffer;
+       pagemap_entry_t *buffer;
 };
 
+#define PAGEMAP_WALK_SIZE      (PMD_SIZE)
+#define PAGEMAP_WALK_MASK      (PMD_MASK)
+
 #define PM_ENTRY_BYTES      sizeof(u64)
 #define PM_STATUS_BITS      3
 #define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
@@ -620,10 +706,15 @@ struct pagemapread {
 #define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
 #define PM_END_OF_BUFFER    1
 
-static int add_to_pagemap(unsigned long addr, u64 pfn,
+static inline pagemap_entry_t make_pme(u64 val)
+{
+       return (pagemap_entry_t) { .pme = val };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
                          struct pagemapread *pm)
 {
-       pm->buffer[pm->pos++] = pfn;
+       pm->buffer[pm->pos++] = *pme;
        if (pm->pos >= pm->len)
                return PM_END_OF_BUFFER;
        return 0;
@@ -635,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
        struct pagemapread *pm = walk->private;
        unsigned long addr;
        int err = 0;
+       pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+
        for (addr = start; addr < end; addr += PAGE_SIZE) {
-               err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+               err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        break;
        }
@@ -649,17 +742,35 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
        return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
 }
 
-static u64 pte_to_pagemap_entry(pte_t pte)
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)
 {
-       u64 pme = 0;
        if (is_swap_pte(pte))
-               pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
-                       | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
+               *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte))
+                               | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);
        else if (pte_present(pte))
-               pme = PM_PFRAME(pte_pfn(pte))
-                       | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
-       return pme;
+               *pme = make_pme(PM_PFRAME(pte_pfn(pte))
+                               | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+                                       pmd_t pmd, int offset)
+{
+       /*
+        * Currently pmd for thp is always present because thp can not be
+        * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
+        * This if-check is just to prepare for future implementation.
+        */
+       if (pmd_present(pmd))
+               *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
+                               | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
 }
+#else
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+                                               pmd_t pmd, int offset)
+{
+}
+#endif
 
 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
@@ -668,13 +779,30 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        struct pagemapread *pm = walk->private;
        pte_t *pte;
        int err = 0;
+       pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
 
-       split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
 
        /* find the first VMA at or above 'addr' */
        vma = find_vma(walk->mm, addr);
+       spin_lock(&walk->mm->page_table_lock);
+       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+               for (; addr != end; addr += PAGE_SIZE) {
+                       unsigned long offset;
+
+                       offset = (addr & ~PAGEMAP_WALK_MASK) >>
+                                       PAGE_SHIFT;
+                       thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+                       err = add_to_pagemap(addr, &pme, pm);
+                       if (err)
+                               break;
+               }
+               spin_unlock(&walk->mm->page_table_lock);
+               return err;
+       }
+
        for (; addr != end; addr += PAGE_SIZE) {
-               u64 pfn = PM_NOT_PRESENT;
 
                /* check to see if we've left 'vma' behind
                 * and need a new, higher one */
@@ -686,11 +814,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                if (vma && (vma->vm_start <= addr) &&
                    !is_vm_hugetlb_page(vma)) {
                        pte = pte_offset_map(pmd, addr);
-                       pfn = pte_to_pagemap_entry(*pte);
+                       pte_to_pagemap_entry(&pme, *pte);
                        /* unmap before userspace copy */
                        pte_unmap(pte);
                }
-               err = add_to_pagemap(addr, pfn, pm);
+               err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        return err;
        }
@@ -701,13 +829,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+                                       pte_t pte, int offset)
 {
-       u64 pme = 0;
        if (pte_present(pte))
-               pme = PM_PFRAME(pte_pfn(pte) + offset)
-                       | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
-       return pme;
+               *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
+                               | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -717,12 +844,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 {
        struct pagemapread *pm = walk->private;
        int err = 0;
-       u64 pfn;
+       pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
 
        for (; addr != end; addr += PAGE_SIZE) {
                int offset = (addr & ~hmask) >> PAGE_SHIFT;
-               pfn = huge_pte_to_pagemap_entry(*pte, offset);
-               err = add_to_pagemap(addr, pfn, pm);
+               huge_pte_to_pagemap_entry(&pme, *pte, offset);
+               err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        return err;
        }
@@ -757,8 +884,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
-#define PAGEMAP_WALK_SIZE      (PMD_SIZE)
-#define PAGEMAP_WALK_MASK      (PMD_MASK)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
 {
@@ -941,26 +1066,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
        pte_t *pte;
 
        md = walk->private;
-       spin_lock(&walk->mm->page_table_lock);
-       if (pmd_trans_huge(*pmd)) {
-               if (pmd_trans_splitting(*pmd)) {
-                       spin_unlock(&walk->mm->page_table_lock);
-                       wait_split_huge_page(md->vma->anon_vma, pmd);
-               } else {
-                       pte_t huge_pte = *(pte_t *)pmd;
-                       struct page *page;
-
-                       page = can_gather_numa_stats(huge_pte, md->vma, addr);
-                       if (page)
-                               gather_stats(page, md, pte_dirty(huge_pte),
-                                               HPAGE_PMD_SIZE/PAGE_SIZE);
-                       spin_unlock(&walk->mm->page_table_lock);
-                       return 0;
-               }
-       } else {
+
+       if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+               pte_t huge_pte = *(pte_t *)pmd;
+               struct page *page;
+
+               page = can_gather_numa_stats(huge_pte, md->vma, addr);
+               if (page)
+                       gather_stats(page, md, pte_dirty(huge_pte),
+                                    HPAGE_PMD_SIZE/PAGE_SIZE);
                spin_unlock(&walk->mm->page_table_lock);
+               return 0;
        }
 
+       if (pmd_trans_unstable(pmd))
+               return 0;
        orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        do {
                struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
@@ -1002,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
-static int show_numa_map(struct seq_file *m, void *v)
+static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 {
        struct numa_maps_private *numa_priv = m->private;
        struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1039,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v)
                seq_path(m, &file->f_path, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_printf(m, " heap");
-       } else if (vma->vm_start <= mm->start_stack &&
-                       vma->vm_end >= mm->start_stack) {
-               seq_printf(m, " stack");
+       } else {
+               pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
+               if (tid != 0) {
+                       /*
+                        * Thread stack in /proc/PID/task/TID/maps or
+                        * the main process stack.
+                        */
+                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
+                           vma->vm_end >= mm->start_stack))
+                               seq_printf(m, " stack");
+                       else
+                               seq_printf(m, " stack:%d", tid);
+               }
        }
 
        if (is_vm_hugetlb_page(vma))
@@ -1084,21 +1214,39 @@ out:
        return 0;
 }
 
+static int show_pid_numa_map(struct seq_file *m, void *v)
+{
+       return show_numa_map(m, v, 1);
+}
+
+static int show_tid_numa_map(struct seq_file *m, void *v)
+{
+       return show_numa_map(m, v, 0);
+}
+
 static const struct seq_operations proc_pid_numa_maps_op = {
-        .start  = m_start,
-        .next   = m_next,
-        .stop   = m_stop,
-        .show   = show_numa_map,
+       .start  = m_start,
+       .next   = m_next,
+       .stop   = m_stop,
+       .show   = show_pid_numa_map,
+};
+
+static const struct seq_operations proc_tid_numa_maps_op = {
+       .start  = m_start,
+       .next   = m_next,
+       .stop   = m_stop,
+       .show   = show_tid_numa_map,
 };
 
-static int numa_maps_open(struct inode *inode, struct file *file)
+static int numa_maps_open(struct inode *inode, struct file *file,
+                         const struct seq_operations *ops)
 {
        struct numa_maps_private *priv;
        int ret = -ENOMEM;
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (priv) {
                priv->proc_maps.pid = proc_pid(inode);
-               ret = seq_open(file, &proc_pid_numa_maps_op);
+               ret = seq_open(file, ops);
                if (!ret) {
                        struct seq_file *m = file->private_data;
                        m->private = priv;
@@ -1109,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)
        return ret;
 }
 
-const struct file_operations proc_numa_maps_operations = {
-       .open           = numa_maps_open,
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+       return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+static int tid_numa_maps_open(struct inode *inode, struct file *file)
+{
+       return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+       .open           = pid_numa_maps_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+const struct file_operations proc_tid_numa_maps_operations = {
+       .open           = tid_numa_maps_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
index 980de547c0709b758307dce07a6e81ea9a9c59bd..74fe164d1b233924f6a4d3e8ddd24624660dd63a 100644 (file)
@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)
 /*
  * display a single VMA to a sequenced file
  */
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
+                         int is_pid)
 {
        struct mm_struct *mm = vma->vm_mm;
+       struct proc_maps_private *priv = m->private;
        unsigned long ino = 0;
        struct file *file;
        dev_t dev = 0;
@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
                pad_len_spaces(m, len);
                seq_path(m, &file->f_path, "");
        } else if (mm) {
-               if (vma->vm_start <= mm->start_stack &&
-                       vma->vm_end >= mm->start_stack) {
+               pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+
+               if (tid != 0) {
                        pad_len_spaces(m, len);
-                       seq_puts(m, "[stack]");
+                       /*
+                        * Thread stack in /proc/PID/task/TID/maps or
+                        * the main process stack.
+                        */
+                       if (!is_pid || (vma->vm_start <= mm->start_stack &&
+                           vma->vm_end >= mm->start_stack))
+                               seq_printf(m, "[stack]");
+                       else
+                               seq_printf(m, "[stack:%d]", tid);
                }
        }
 
@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 /*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_p)
+static int show_map(struct seq_file *m, void *_p, int is_pid)
 {
        struct rb_node *p = _p;
 
-       return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+       return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
+                             is_pid);
+}
+
+static int show_pid_map(struct seq_file *m, void *_p)
+{
+       return show_map(m, _p, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *_p)
+{
+       return show_map(m, _p, 0);
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {
        .start  = m_start,
        .next   = m_next,
        .stop   = m_stop,
-       .show   = show_map
+       .show   = show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_ops = {
+       .start  = m_start,
+       .next   = m_next,
+       .stop   = m_stop,
+       .show   = show_tid_map
 };
 
-static int maps_open(struct inode *inode, struct file *file)
+static int maps_open(struct inode *inode, struct file *file,
+                    const struct seq_operations *ops)
 {
        struct proc_maps_private *priv;
        int ret = -ENOMEM;
@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (priv) {
                priv->pid = proc_pid(inode);
-               ret = seq_open(file, &proc_pid_maps_ops);
+               ret = seq_open(file, ops);
                if (!ret) {
                        struct seq_file *m = file->private_data;
                        m->private = priv;
@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)
        return ret;
 }
 
-const struct file_operations proc_maps_operations = {
-       .open           = maps_open,
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+       return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+       return maps_open(inode, file, &proc_tid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+       .open           = pid_maps_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+       .open           = tid_maps_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_private,
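
The user-visible effect of the pid/tid split is easiest to see from user space: the main process stack keeps its plain annotation, while other threads' stacks gain a ":<tid>" suffix in the per-task maps files. A small illustrative reader (not part of this series; the path and output format are examples):

/* Illustrative user-space sketch: print every mapping the kernel now
 * annotates as a stack ("[stack]" for the main process stack,
 * "[stack:<tid>]" for other threads' stacks in the per-task files). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[stack"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
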
index 4023d6be939bc8521e9afca4094ad96ef8cf3c9f..aa242dc99373bec0fe981d22508921454759bd14 100644 (file)
@@ -140,9 +140,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 
        mutex_lock(&m->lock);
 
+       /*
+        * seq_file->op->..m_start/m_stop/m_next may do special actions
+        * or optimisations based on the file->f_version, so we want to
+        * pass the file->f_version to those methods.
+        *
+        * seq_file->version is just a copy of f_version, and seq_file
+        * methods can treat it simply as the file version.
+        * It is copied in first and copied out after all operations.
+        * It is convenient to have it as part of the structure to avoid
+        * the need to pass another argument to all the seq_file methods.
+        */
+       m->version = file->f_version;
+
        /* Don't assume *ppos is where we left it */
        if (unlikely(*ppos != m->read_pos)) {
-               m->read_pos = *ppos;
                while ((err = traverse(m, *ppos)) == -EAGAIN)
                        ;
                if (err) {
@@ -152,21 +164,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
                        m->index = 0;
                        m->count = 0;
                        goto Done;
+               } else {
+                       m->read_pos = *ppos;
                }
        }
 
-       /*
-        * seq_file->op->..m_start/m_stop/m_next may do special actions
-        * or optimisations based on the file->f_version, so we want to
-        * pass the file->f_version to those methods.
-        *
-        * seq_file->version is just copy of f_version, and seq_file
-        * methods can treat it simply as file version.
-        * It is copied in first and copied out after all operations.
-        * It is convenient to have it as  part of structure to avoid the
-        * need of passing another argument to all the seq_file methods.
-        */
-       m->version = file->f_version;
        /* grab buffer if we didn't have one */
        if (!m->buf) {
                m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
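
The reordering above matters because ->start/->stop implementations may consult m->version before the first record is produced. A hedged sketch of such a provider (hypothetical, in the spirit of the proc maps walkers):

/* Sketch only: a ->start method that uses m->version as a resume cookie,
 * which is why seq_read() must copy file->f_version into it before
 * traverse() runs. */
static void *example_start(struct seq_file *m, loff_t *pos)
{
	u64 cookie = m->version;	/* set from file->f_version by seq_read() */

	if (cookie && *pos) {
		/*
		 * A previous ->stop stored a stable key here (the proc maps
		 * walkers store an address); jump straight to that record
		 * instead of re-walking everything up to *pos.
		 */
	}
	/* ... locate and return the first record for this read, or NULL ... */
	return NULL;
}
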
index 76bff2bff15e346532be60dc1ce1a13aff070471..a03c098b0cce94e06c87e022fb33f9e19b8f89cb 100644 (file)
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
                                unsigned long size);
 #endif
 
+#ifdef CONFIG_MMU
+
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pmd_trans_huge(pmd_t pmd)
 {
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
        return 0;
 }
 #endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * This function is meant to be used by sites walking pagetables with
+ * the mmap_sem held in read mode to protect against MADV_DONTNEED and
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
+ * into a null pmd and the transhuge page fault can convert a null pmd
+ * into a huge pmd or into a regular pmd (if the hugepage allocation
+ * fails). While holding the mmap_sem in read mode the pmd becomes
+ * stable and stops changing under us only if it's not null and not a
+ * transhuge pmd. When those races occur and this function makes a
+ * difference vs the standard pmd_none_or_clear_bad, the result is
+ * undefined, so behaving as if the pmd were none is safe (because it
+ * can return none anyway). The compiler-level barrier() is critically
+ * important to compute the two checks atomically on the same pmdval.
+ */
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+{
+       /* depend on compiler for an atomic pmd read */
+       pmd_t pmdval = *pmd;
+       /*
+        * The barrier will stabilize the pmdval in a register or on
+        * the stack so that it will stop changing under the code.
+        */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       barrier();
+#endif
+       if (pmd_none(pmdval))
+               return 1;
+       if (unlikely(pmd_bad(pmdval))) {
+               if (!pmd_trans_huge(pmdval))
+                       pmd_clear_bad(pmd);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * This is a no-op if Transparent Hugepage Support is not built into
+ * the kernel. Otherwise it is equivalent to
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
+ * places that have already verified the pmd is not none and want to
+ * walk ptes while holding the mmap_sem in read mode (write mode doesn't
+ * need this). If THP is not enabled, the pmd can't go away under the
+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
+ * run pmd_trans_unstable before walking the ptes after
+ * split_huge_page_pmd returns (because split_huge_page_pmd may have run
+ * while the pmd was none, and a page fault can then map in a THP rather
+ * than a regular page).
+ */
+static inline int pmd_trans_unstable(pmd_t *pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       return pmd_none_or_trans_huge_or_clear_bad(pmd);
+#else
+       return 0;
 #endif
+}
+
+#endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
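
For reference, a minimal sketch of the caller pattern these helpers target, with mmap_sem held for read; it mirrors the pte walkers updated earlier in this diff and is illustrative only:

/* Sketch: walk the ptes under one pmd, skipping pmds that are none or
 * transhuge (those can change under us with only mmap_sem held for read). */
static int walk_pmd_range_sketch(struct mm_struct *mm, pmd_t *pmd,
				 unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	do {
		/* ... examine *pte here ... */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}
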
 
index 501adb1b2f439f9ec746ba9a83b1f243682a9237..5a85b3415c1b56b971f54a6f65e87ea1bf5e64af 100644 (file)
@@ -498,7 +498,7 @@ struct cgroup_subsys {
        struct list_head sibling;
        /* used when use_id == true */
        struct idr idr;
-       rwlock_t id_lock;
+       spinlock_t id_lock;
 
        /* should be defined only by modular subsystems */
        struct module *module;
index bb2bbdbe546495e1f0ab94e87b58578bc5c0c53f..51a90b7f2d606a3dcb9bb582cd209add0c429351 100644 (file)
@@ -23,6 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
                        bool sync);
+extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
 /* Do not skip compaction more than 64 times */
@@ -33,20 +34,26 @@ extern unsigned long compaction_suitable(struct zone *zone, int order);
  * allocation success. 1 << compact_defer_limit compactions are skipped up
  * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
  */
-static inline void defer_compaction(struct zone *zone)
+static inline void defer_compaction(struct zone *zone, int order)
 {
        zone->compact_considered = 0;
        zone->compact_defer_shift++;
 
+       if (order < zone->compact_order_failed)
+               zone->compact_order_failed = order;
+
        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
 }
 
 /* Returns true if compaction should be skipped this time */
-static inline bool compaction_deferred(struct zone *zone)
+static inline bool compaction_deferred(struct zone *zone, int order)
 {
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;
 
+       if (order < zone->compact_order_failed)
+               return false;
+
        /* Avoid possible overflow */
        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;
@@ -62,16 +69,21 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
        return COMPACT_CONTINUE;
 }
 
+static inline int compact_pgdat(pg_data_t *pgdat, int order)
+{
+       return COMPACT_CONTINUE;
+}
+
 static inline unsigned long compaction_suitable(struct zone *zone, int order)
 {
        return COMPACT_SKIPPED;
 }
 
-static inline void defer_compaction(struct zone *zone)
+static inline void defer_compaction(struct zone *zone, int order)
 {
 }
 
-static inline bool compaction_deferred(struct zone *zone)
+static inline bool compaction_deferred(struct zone *zone, int order)
 {
        return 1;
 }
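
A hedged sketch of the per-order deferral cycle the new order argument enables; run_compaction() stands in for the real compaction call and is hypothetical:

/* Sketch: skip zones that recently failed at this order or above, and
 * widen the back-off window when a synchronous attempt fails again. */
static void compact_zone_sketch(struct zone *zone, int order, bool sync)
{
	bool succeeded;

	if (compaction_deferred(zone, order))
		return;				/* back-off window still open */

	succeeded = run_compaction(zone, order, sync);	/* hypothetical helper */

	if (!succeeded && sync)
		defer_compaction(zone, order);	/* remember the failing order */
}
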
index e9eaec522655c4da77f55fefb851564c31cae711..7a7e5fd2a27784e3307e3351c92abd4240aba2db 100644 (file)
@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 /*
- * reading current mems_allowed and mempolicy in the fastpath must protected
- * by get_mems_allowed()
+ * get_mems_allowed is required when making decisions involving mems_allowed
+ * such as during page allocation. mems_allowed can be updated in parallel,
+ * and depending on the new value an operation can fail, potentially causing
+ * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+ * prevents these artificial failures.
  */
-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
 {
-       current->mems_allowed_change_disable++;
-
-       /*
-        * ensure that reading mems_allowed and mempolicy happens after the
-        * update of ->mems_allowed_change_disable.
-        *
-        * the write-side task finds ->mems_allowed_change_disable is not 0,
-        * and knows the read-side task is reading mems_allowed or mempolicy,
-        * so it will clear old bits lazily.
-        */
-       smp_mb();
+       return read_seqcount_begin(&current->mems_allowed_seq);
 }
 
-static inline void put_mems_allowed(void)
+/*
+ * If this returns false, the operation that took place after get_mems_allowed
+ * may have failed. It is up to the caller to retry the operation if
+ * appropriate.
+ */
+static inline bool put_mems_allowed(unsigned int seq)
 {
-       /*
-        * ensure that reading mems_allowed and mempolicy before reducing
-        * mems_allowed_change_disable.
-        *
-        * the write-side task will know that the read-side task is still
-        * reading mems_allowed or mempolicy, don't clears old bits in the
-        * nodemask.
-        */
-       smp_mb();
-       --ACCESS_ONCE(current->mems_allowed_change_disable);
+       return !read_seqcount_retry(&current->mems_allowed_seq, seq);
 }
 
 static inline void set_mems_allowed(nodemask_t nodemask)
 {
        task_lock(current);
+       write_seqcount_begin(&current->mems_allowed_seq);
        current->mems_allowed = nodemask;
+       write_seqcount_end(&current->mems_allowed_seq);
        task_unlock(current);
 }
 
@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 {
 }
 
-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
 {
+       return 0;
 }
 
-static inline void put_mems_allowed(void)
+static inline bool put_mems_allowed(unsigned int seq)
 {
+       return true;
 }
 
 #endif /* !CONFIG_CPUSETS */
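
With the seqcount conversion, callers no longer block updaters; they retry the allocation when a concurrent mems_allowed change is detected and the attempt failed. A sketch of the expected pattern (it mirrors the __page_cache_alloc() hunk later in this diff):

/* Sketch: retry only if mems_allowed changed underneath us and the attempt
 * failed; a stale-but-successful allocation is left alone. */
static struct page *alloc_page_on_spread_node(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;
	int nid;

	do {
		cpuset_mems_cookie = get_mems_allowed();
		nid = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(nid, gfp, 0);
	} while (!put_mems_allowed(cpuset_mems_cookie) && !page);

	return page;
}
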
index 1b921299abc420f1e258b614aa319964cd98942c..c8af7a2efb5288e582e3fba9df08308d6c3f8076 100644 (file)
@@ -51,6 +51,9 @@ extern pmd_t *page_check_address_pmd(struct page *page,
                                     unsigned long address,
                                     enum page_check_address_pmd_flag flag);
 
+#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define HPAGE_PMD_SHIFT HPAGE_SHIFT
 #define HPAGE_PMD_MASK HPAGE_MASK
@@ -102,8 +105,6 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
                BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
                       pmd_trans_huge(*____pmd));                       \
        } while (0)
-#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
-#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
@@ -113,6 +114,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
+extern int __pmd_trans_huge_lock(pmd_t *pmd,
+                                struct vm_area_struct *vma);
+/* mmap_sem must be held on entry */
+static inline int pmd_trans_huge_lock(pmd_t *pmd,
+                                     struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+       if (pmd_trans_huge(*pmd))
+               return __pmd_trans_huge_lock(pmd, vma);
+       else
+               return 0;
+}
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
@@ -146,9 +159,9 @@ static inline struct page *compound_trans_head(struct page *page)
        return page;
 }
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUG(); 0; })
+#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
 #define hpage_nr_pages(x) 1
 
@@ -176,6 +189,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         long adjust_next)
 {
 }
+static inline int pmd_trans_huge_lock(pmd_t *pmd,
+                                     struct vm_area_struct *vma)
+{
+       return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
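
A short sketch of how a page-table walker is expected to combine the new lock helper with the ordinary pte path (mmap_sem held for read; compare the task_mmu.c and huge_memory.c hunks):

/* Sketch: take the huge-pmd path only when pmd_trans_huge_lock() reports a
 * stable transhuge pmd; it returns with page_table_lock held in that case. */
static void walk_pmd_sketch(struct vm_area_struct *vma, pmd_t *pmd)
{
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* ... operate on the whole huge pmd ... */
		spin_unlock(&vma->vm_mm->page_table_lock);
		return;
	}
	/* otherwise fall through to the ordinary pte-by-pte path */
}
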
index d9d6c868b86bc01226031d63ce5ee72eb44ebed6..000837e126e64b3b0e64d67aff67a3c9626e4c5e 100644 (file)
@@ -14,6 +14,15 @@ struct user_struct;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+struct hugepage_subpool {
+       spinlock_t lock;
+       long count;
+       long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
 int PageHuge(struct page *page);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -128,35 +137,14 @@ enum {
 };
 
 #ifdef CONFIG_HUGETLBFS
-struct hugetlbfs_config {
-       uid_t   uid;
-       gid_t   gid;
-       umode_t mode;
-       long    nr_blocks;
-       long    nr_inodes;
-       struct hstate *hstate;
-};
-
 struct hugetlbfs_sb_info {
-       long    max_blocks;   /* blocks allowed */
-       long    free_blocks;  /* blocks free */
        long    max_inodes;   /* inodes allowed */
        long    free_inodes;  /* inodes free */
        spinlock_t      stat_lock;
        struct hstate *hstate;
+       struct hugepage_subpool *spool;
 };
 
-
-struct hugetlbfs_inode_info {
-       struct shared_policy policy;
-       struct inode vfs_inode;
-};
-
-static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
-{
-       return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
-}
-
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -164,10 +152,9 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
+struct file *hugetlb_file_setup(const char *name, unsigned long addr,
+                               size_t size, vm_flags_t acct,
                                struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
@@ -179,15 +166,11 @@ static inline int is_file_hugepages(struct file *file)
        return 0;
 }
 
-static inline void set_file_hugepages(struct file *file)
-{
-       file->f_op = &hugetlbfs_file_operations;
-}
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)                        0
-#define set_file_hugepages(file)               BUG()
-static inline struct file *hugetlb_file_setup(const char *name, size_t size,
+static inline struct file *
+hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
                vm_flags_t acctflag, struct user_struct **user, int creat_flags)
 {
        return ERR_PTR(-ENOSYS);
index f994d51f70f20cc7b1e1bd92ed8163e1e74454de..e4baff5f7ff403722f54b5e6dcb7f67b2926fa98 100644 (file)
@@ -29,6 +29,13 @@ extern struct fs_struct init_fs;
 #define INIT_GROUP_RWSEM(sig)
 #endif
 
+#ifdef CONFIG_CPUSETS
+#define INIT_CPUSET_SEQ                                                        \
+       .mems_allowed_seq = SEQCNT_ZERO,
+#else
+#define INIT_CPUSET_SEQ
+#endif
+
 #define INIT_SIGNALS(sig) {                                            \
        .nr_threads     = 1,                                            \
        .wait_chldexit  = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -192,6 +199,7 @@ extern struct cred init_cred;
        INIT_FTRACE_GRAPH                                               \
        INIT_TRACE_RECURSION                                            \
        INIT_TASK_RCU_PREEMPT(tsk)                                      \
+       INIT_CPUSET_SEQ                                                 \
 }
 
 
index bd92a89f4b0aa49cadcb6d320a8d2815ffd190c3..26a65711676f421dd1e287734ac4f781c713c8b0 100644 (file)
@@ -30,6 +30,7 @@
 #define KPF_NOPAGE             20
 
 #define KPF_KSM                        21
+#define KPF_THP                        22
 
 /* kernel hacking assistances
  * WARNING: subject to change, never rely on them!
index b80de520670b64a67dc2ceb8b4e4e11c1d623071..f94efd2f6c275b0ff14859e2451da53a6e3115a9 100644 (file)
@@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
 
-extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask);
+extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                                    int order);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
@@ -140,6 +141,34 @@ static inline bool mem_cgroup_disabled(void)
        return false;
 }
 
+void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
+                                        unsigned long *flags);
+
+extern atomic_t memcg_moving;
+
+static inline void mem_cgroup_begin_update_page_stat(struct page *page,
+                                       bool *locked, unsigned long *flags)
+{
+       if (mem_cgroup_disabled())
+               return;
+       rcu_read_lock();
+       *locked = false;
+       if (atomic_read(&memcg_moving))
+               __mem_cgroup_begin_update_page_stat(page, locked, flags);
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page,
+                               unsigned long *flags);
+static inline void mem_cgroup_end_update_page_stat(struct page *page,
+                                       bool *locked, unsigned long *flags)
+{
+       if (mem_cgroup_disabled())
+               return;
+       if (*locked)
+               __mem_cgroup_end_update_page_stat(page, flags);
+       rcu_read_unlock();
+}
+
 void mem_cgroup_update_page_stat(struct page *page,
                                 enum mem_cgroup_page_stat_item idx,
                                 int val);
@@ -298,21 +327,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
-{
-       return 0;
-}
-
-static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg,
-                                               int priority)
-{
-}
-
-static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg,
-                                               int priority)
-{
-}
-
 static inline bool mem_cgroup_disabled(void)
 {
        return true;
@@ -355,6 +369,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
+static inline void mem_cgroup_begin_update_page_stat(struct page *page,
+                                       bool *locked, unsigned long *flags)
+{
+}
+
+static inline void mem_cgroup_end_update_page_stat(struct page *page,
+                                       bool *locked, unsigned long *flags)
+{
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
                                            enum mem_cgroup_page_stat_item idx)
 {
@@ -391,7 +415,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
                                struct page *newpage)
 {
 }
-#endif /* CONFIG_CGROUP_MEM_CONT */
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
 static inline bool
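
A hedged sketch of the intended begin/end bracket around a page statistics update; the caller name is hypothetical and MEMCG_NR_FILE_MAPPED is assumed to be the existing stat index:

/* Sketch: bracket a file-mapped stat update with the new helpers so a
 * concurrent charge move is detected and handled under the right lock. */
static void account_page_mapped_sketch(struct page *page)
{
	bool locked;
	unsigned long flags;

	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
	/* ... the mapcount/page-flag update itself would happen here ... */
	mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
	mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
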
index 05ed2828a5535bfe151c17ab2b5a656a5209ef4c..855c337b20c35cb22ef0a3fa6213cfb82a3a90b0 100644 (file)
@@ -8,7 +8,6 @@
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
-#define PAGE_MIGRATION 1
 
 extern void putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
@@ -32,7 +31,6 @@ extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 #else
-#define PAGE_MIGRATION 0
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
index b5bb54d6d667dbdd333aab7f92f1c819fe81fca8..ee67e326b6f8bc277ea5b9a6972077491f2b688c 100644 (file)
@@ -1040,6 +1040,9 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
                !vma_growsup(vma->vm_next, addr);
 }
 
+extern pid_t
+vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
+
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len);
@@ -1058,19 +1061,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 /*
  * per-process(per-mm_struct) statistics.
  */
-static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
-{
-       atomic_long_set(&mm->rss_stat.count[member], value);
-}
-
-#if defined(SPLIT_RSS_COUNTING)
-unsigned long get_mm_counter(struct mm_struct *mm, int member);
-#else
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-       return atomic_long_read(&mm->rss_stat.count[member]);
-}
+       long val = atomic_long_read(&mm->rss_stat.count[member]);
+
+#ifdef SPLIT_RSS_COUNTING
+       /*
+        * The counter is updated asynchronously and may temporarily go
+        * negative, but a negative value is never what users expect.
+        */
+       if (val < 0)
+               val = 0;
 #endif
+       return (unsigned long)val;
+}
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
@@ -1127,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 }
 
 #if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
+void sync_mm_rss(struct mm_struct *mm);
 #else
-static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+static inline void sync_mm_rss(struct mm_struct *mm)
 {
 }
 #endif
@@ -1291,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid,
 extern unsigned long find_min_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
                                                unsigned long max_low_pfn);
-int add_from_early_node_map(struct range *range, int az,
-                                  int nr_range, int nid);
 extern void sparse_memory_present_with_active_regions(int nid);
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
index 650ba2fb3301af5975fa1b027ebdb008cf573e61..dff711509661c8e5eb74fce083eef26e526a783e 100644 (file)
@@ -365,6 +365,7 @@ struct zone {
         */
        unsigned int            compact_considered;
        unsigned int            compact_defer_shift;
+       int                     compact_order_failed;
 #endif
 
        ZONE_PADDING(_pad1_)
index 552fba9c7d5a5a17386901ba00c6f403513e636f..3d7647536b0304ba40013aa2399d551b6779b4cf 100644 (file)
@@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-               int order, nodemask_t *mask);
+               int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
index e90a673be67e1eebb4c98ba30c198ede805dcb82..6b25758e028e06fc1e3d4db30a6cd88c175f3039 100644 (file)
@@ -414,11 +414,26 @@ static inline int PageTransHuge(struct page *page)
        return PageHead(page);
 }
 
+/*
+ * PageTransCompound returns true for both transparent huge pages
+ * and hugetlbfs pages, so it should only be called when it's known
+ * that hugetlbfs pages aren't involved.
+ */
 static inline int PageTransCompound(struct page *page)
 {
        return PageCompound(page);
 }
 
+/*
+ * PageTransTail returns true for both transparent huge pages
+ * and hugetlbfs pages, so it should only be called when it's known
+ * that hugetlbfs pages aren't involved.
+ */
+static inline int PageTransTail(struct page *page)
+{
+       return PageTail(page);
+}
+
 #else
 
 static inline int PageTransHuge(struct page *page)
@@ -430,6 +445,11 @@ static inline int PageTransCompound(struct page *page)
 {
        return 0;
 }
+
+static inline int PageTransTail(struct page *page)
+{
+       return 0;
+}
 #endif
 
 #ifdef CONFIG_MMU
index a2d11771c84b1f4a8717670ddc6720f404a8820f..a88cdba27809b490cc64b283e32d841ab7f0dd42 100644 (file)
@@ -4,12 +4,8 @@
 enum {
        /* flags for mem_cgroup */
        PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
-       PCG_CACHE, /* charged as cache */
        PCG_USED, /* this object is in use. */
        PCG_MIGRATION, /* under page migration */
-       /* flags for mem_cgroup and file and I/O status */
-       PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
-       PCG_FILE_MAPPED, /* page is accounted as "mapped" */
        __NR_PCG_FLAGS,
 };
 
@@ -64,19 +60,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc)   \
 static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)   \
        { return test_and_clear_bit(PCG_##lname, &pc->flags);  }
 
-/* Cache flag is set only once (at allocation) */
-TESTPCGFLAG(Cache, CACHE)
-CLEARPCGFLAG(Cache, CACHE)
-SETPCGFLAG(Cache, CACHE)
-
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
 SETPCGFLAG(Used, USED)
 
-SETPCGFLAG(FileMapped, FILE_MAPPED)
-CLEARPCGFLAG(FileMapped, FILE_MAPPED)
-TESTPCGFLAG(FileMapped, FILE_MAPPED)
-
 SETPCGFLAG(Migration, MIGRATION)
 CLEARPCGFLAG(Migration, MIGRATION)
 TESTPCGFLAG(Migration, MIGRATION)
@@ -85,7 +72,7 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
        /*
         * Don't take this lock in IRQ context.
-        * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
+        * This lock is for pc->mem_cgroup, USED, MIGRATION
         */
        bit_spin_lock(PCG_LOCK, &pc->flags);
 }
@@ -95,24 +82,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
        bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
-static inline void move_lock_page_cgroup(struct page_cgroup *pc,
-       unsigned long *flags)
-{
-       /*
-        * We know updates to pc->flags of page cache's stats are from both of
-        * usual context or IRQ context. Disable IRQ to avoid deadlock.
-        */
-       local_irq_save(*flags);
-       bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
-}
-
-static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
-       unsigned long *flags)
-{
-       bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
-       local_irq_restore(*flags);
-}
-
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
index 1cdd62a2788a99436f666ec9d9e194f3c341e769..fd07c4542cee4f60784081267fb069787919a5bc 100644 (file)
@@ -122,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
 void anon_vma_moveto_tail(struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
-void __anon_vma_link(struct vm_area_struct *);
 
 static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
index e074e1e54f8573f55650684d58181d8a53d706fd..0c147a4260a51b47e79e347d9aa4239969a5d638 100644 (file)
@@ -1514,7 +1514,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_CPUSETS
        nodemask_t mems_allowed;        /* Protected by alloc_lock */
-       int mems_allowed_change_disable;
+       seqcount_t mems_allowed_seq;    /* Sequence no to catch updates */
        int cpuset_mem_spread_rotor;
        int cpuset_slab_spread_rotor;
 #endif
index 3e60228e7299bade2a864f21c9d4addc6c304cf7..b86b5c20617d93e361d462e20c0c3c87856214b6 100644 (file)
@@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone,
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
+extern void lru_add_drain_cpu(int cpu);
 extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_page(struct page *page);
@@ -329,7 +330,6 @@ extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
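
lru_add_drain_cpu() exports the per-CPU drain; the presumed relationship to the existing lru_add_drain() is shown below (the mm/swap.c body is not visible in this hunk, so this is an assumption):

/* Presumed shape of the wrapper: drain the pagevecs of the local CPU. */
void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}
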
index b76be5bda6c2f10fd98019bea5730c24b7614e98..406c5b208193373b979ce82bffe6617250ea64ed 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -482,7 +482,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
                /* hugetlb_file_setup applies strict accounting */
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
-               file = hugetlb_file_setup(name, size, acctflag,
+               file = hugetlb_file_setup(name, 0, size, acctflag,
                                        &shp->mlock_user, HUGETLB_SHMFS_INODE);
        } else {
                /*
index 1ece8e20fdb50b4109bf1d99a1e465808863568a..f4ea4b6f3cf1eae8725ab6107f20bab58324f103 100644 (file)
@@ -4881,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
        rcu_assign_pointer(id->css, NULL);
        rcu_assign_pointer(css->id, NULL);
-       write_lock(&ss->id_lock);
+       spin_lock(&ss->id_lock);
        idr_remove(&ss->idr, id->id);
-       write_unlock(&ss->id_lock);
+       spin_unlock(&ss->id_lock);
        kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4909,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
                error = -ENOMEM;
                goto err_out;
        }
-       write_lock(&ss->id_lock);
+       spin_lock(&ss->id_lock);
        /* Don't use 0. allocates an ID of 1-65535 */
        error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-       write_unlock(&ss->id_lock);
+       spin_unlock(&ss->id_lock);
 
        /* Returns error when there are no free spaces for new ID.*/
        if (error) {
@@ -4927,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
        return newid;
 remove_idr:
        error = -ENOSPC;
-       write_lock(&ss->id_lock);
+       spin_lock(&ss->id_lock);
        idr_remove(&ss->idr, myid);
-       write_unlock(&ss->id_lock);
+       spin_unlock(&ss->id_lock);
 err_out:
        kfree(newid);
        return ERR_PTR(error);
@@ -4941,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
        struct css_id *newid;
 
-       rwlock_init(&ss->id_lock);
+       spin_lock_init(&ss->id_lock);
        idr_init(&ss->idr);
 
        newid = get_new_cssid(ss, 0);
@@ -5029,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
                return NULL;
 
        BUG_ON(!ss->use_id);
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
        /* fill start point for scan */
        tmpid = id;
        while (1) {
@@ -5036,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
                 * scan next entry from bitmap(tree), tmpid is updated after
                 * idr_get_next().
                 */
-               read_lock(&ss->id_lock);
                tmp = idr_get_next(&ss->idr, &tmpid);
-               read_unlock(&ss->id_lock);
-
                if (!tmp)
                        break;
                if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
index 5d575836dba66ee45001909c58254b089838e73c..1010cc61931ff00f4f7f8890b07f0af66ef2ed1e 100644 (file)
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
        bool need_loop;
 
-repeat:
        /*
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
         */
        need_loop = task_has_mempolicy(tsk) ||
                        !nodes_intersects(*newmems, tsk->mems_allowed);
-       nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
-       /*
-        * ensure checking ->mems_allowed_change_disable after setting all new
-        * allowed nodes.
-        *
-        * the read-side task can see an nodemask with new allowed nodes and
-        * old allowed nodes. and if it allocates page when cpuset clears newly
-        * disallowed ones continuous, it can see the new allowed bits.
-        *
-        * And if setting all new allowed nodes is after the checking, setting
-        * all new allowed nodes and clearing newly disallowed ones will be done
-        * continuous, and the read-side task may find no node to alloc page.
-        */
-       smp_mb();
+       if (need_loop)
+               write_seqcount_begin(&tsk->mems_allowed_seq);
 
-       /*
-        * Allocation of memory is very fast, we needn't sleep when waiting
-        * for the read-side.
-        */
-       while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
-               task_unlock(tsk);
-               if (!task_curr(tsk))
-                       yield();
-               goto repeat;
-       }
-
-       /*
-        * ensure checking ->mems_allowed_change_disable before clearing all new
-        * disallowed nodes.
-        *
-        * if clearing newly disallowed bits before the checking, the read-side
-        * task may find no node to alloc page.
-        */
-       smp_mb();
+       nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
        mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
        tsk->mems_allowed = *newmems;
+
+       if (need_loop)
+               write_seqcount_end(&tsk->mems_allowed_seq);
+
        task_unlock(tsk);
 }
 
index 7ad335c3045a15e4e3d28757d27a8103168a2cdc..16b07bfac224d3f13960b09502fbecfba8897db0 100644 (file)
@@ -935,7 +935,7 @@ void do_exit(long code)
        acct_update_integrals(tsk);
        /* sync mm's RSS info before statistics gathering */
        if (tsk->mm)
-               sync_mm_rss(tsk, tsk->mm);
+               sync_mm_rss(tsk->mm);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                hrtimer_cancel(&tsk->signal->real_timer);
index 26a7138bb849340e76a5cd8f73ed9d9bc84e01e1..37674ec55cde19d12e3deda5845ecb1b8b4d376c 100644 (file)
@@ -512,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
        return NULL;
 }
 
+static void check_mm(struct mm_struct *mm)
+{
+       int i;
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+               if (unlikely(x))
+                       printk(KERN_ALERT "BUG: Bad rss-counter state "
+                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -539,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+       check_mm(mm);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1223,6 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+       seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
index ed055b297c81c127dcbef79cb821f904936245c2..12499ba7967e08f5388835834c08c67dc7af5c24 100644 (file)
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -595,8 +595,10 @@ EXPORT_SYMBOL(idr_for_each);
  * Returns pointer to registered object with id, which is next number to
  * given id. After being looked up, *@nextidp will be updated for the next
  * iteration.
+ *
+ * This function can be called under rcu_read_lock(), given that the leaf
+ * pointers' lifetimes are correctly managed.
  */
-
 void *idr_get_next(struct idr *idp, int *nextidp)
 {
        struct idr_layer *p, *pa[MAX_LEVEL];
@@ -605,11 +607,11 @@ void *idr_get_next(struct idr *idp, int *nextidp)
        int n, max;
 
        /* find first ent */
-       n = idp->layers * IDR_BITS;
-       max = 1 << n;
        p = rcu_dereference_raw(idp->top);
        if (!p)
                return NULL;
+       n = (p->layer + 1) * IDR_BITS;
+       max = 1 << n;
 
        while (id < max) {
                while (n > 0 && p) {
index 668e94df8cf23ab1fa68ab54ff0cd8542a4c460c..0131170c9d540a572c7b2ba3108ca5c2d9db30b7 100644 (file)
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
                                    unsigned long section_nr)
 {
        bootmem_data_t *bdata;
-       unsigned long pfn, goal, limit;
+       unsigned long pfn, goal;
 
        pfn = section_nr_to_pfn(section_nr);
        goal = pfn << PAGE_SHIFT;
-       limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
        bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
-       return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+       return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
 }
 #endif
 
index d9ebebe1a2aaaea69074fe3615faeddd41a4feb7..74a8c825ff289a12079701dfc42be4e1787b8d8f 100644 (file)
@@ -35,7 +35,7 @@ struct compact_control {
        unsigned long migrate_pfn;      /* isolate_migratepages search base */
        bool sync;                      /* Synchronous migration */
 
-       unsigned int order;             /* order a direct compactor needs */
+       int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
 };
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 
 /* Compact all zones within a node */
-static int compact_node(int nid)
+static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 {
        int zoneid;
-       pg_data_t *pgdat;
        struct zone *zone;
 
-       if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
-               return -EINVAL;
-       pgdat = NODE_DATA(nid);
-
-       /* Flush pending updates to the LRU lists */
-       lru_add_drain_all();
-
        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-               struct compact_control cc = {
-                       .nr_freepages = 0,
-                       .nr_migratepages = 0,
-                       .order = -1,
-                       .sync = true,
-               };
 
                zone = &pgdat->node_zones[zoneid];
                if (!populated_zone(zone))
                        continue;
 
-               cc.zone = zone;
-               INIT_LIST_HEAD(&cc.freepages);
-               INIT_LIST_HEAD(&cc.migratepages);
-
-               compact_zone(zone, &cc);
+               cc->nr_freepages = 0;
+               cc->nr_migratepages = 0;
+               cc->zone = zone;
+               INIT_LIST_HEAD(&cc->freepages);
+               INIT_LIST_HEAD(&cc->migratepages);
+
+               if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+                       compact_zone(zone, cc);
+
+               if (cc->order > 0) {
+                       int ok = zone_watermark_ok(zone, cc->order,
+                                               low_wmark_pages(zone), 0, 0);
+                       if (ok && cc->order > zone->compact_order_failed)
+                               zone->compact_order_failed = cc->order + 1;
+                       /* Currently async compaction is never deferred. */
+                       else if (!ok && cc->sync)
+                               defer_compaction(zone, cc->order);
+               }
 
-               VM_BUG_ON(!list_empty(&cc.freepages));
-               VM_BUG_ON(!list_empty(&cc.migratepages));
+               VM_BUG_ON(!list_empty(&cc->freepages));
+               VM_BUG_ON(!list_empty(&cc->migratepages));
        }
 
        return 0;
 }
 
+int compact_pgdat(pg_data_t *pgdat, int order)
+{
+       struct compact_control cc = {
+               .order = order,
+               .sync = false,
+       };
+
+       return __compact_pgdat(pgdat, &cc);
+}
+
+static int compact_node(int nid)
+{
+       struct compact_control cc = {
+               .order = -1,
+               .sync = true,
+       };
+
+       return __compact_pgdat(NODE_DATA(nid), &cc);
+}
+
 /* Compact all nodes in the system */
 static int compact_nodes(void)
 {
        int nid;
 
+       /* Flush pending updates to the LRU lists */
+       lru_add_drain_all();
+
        for_each_online_node(nid)
                compact_node(nid);
 
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
 {
-       compact_node(dev->id);
+       int nid = dev->id;
+
+       if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
+               /* Flush pending updates to the LRU lists */
+               lru_add_drain_all();
+
+               compact_node(nid);
+       }
 
        return count;
 }
index 2f8165075a5a9ac67358f2b51dac2bf2cf14f6c0..843042045dc99821864f50c1cb53c732f093c4e3 100644 (file)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
  *
- *  (code doesn't rely on that order, so you could switch it around)
- *  ->tasklist_lock             (memory_failure, collect_procs_ao)
- *    ->i_mmap_mutex
+ * ->i_mmap_mutex
+ *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
 /*
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
        struct page *page;
 
        if (cpuset_do_page_mem_spread()) {
-               get_mems_allowed();
-               n = cpuset_mem_spread_node();
-               page = alloc_pages_exact_node(n, gfp, 0);
-               put_mems_allowed();
+               unsigned int cpuset_mems_cookie;
+               do {
+                       cpuset_mems_cookie = get_mems_allowed();
+                       n = cpuset_mem_spread_node();
+                       page = alloc_pages_exact_node(n, gfp, 0);
+               } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
                return page;
        }
        return alloc_pages(gfp, 0);
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
        struct page *page;
        gfp_t gfp_notmask = 0;
 
-       gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
+       gfp_mask = mapping_gfp_mask(mapping);
+       if (mapping_cap_account_dirty(mapping))
+               gfp_mask |= __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
 repeat:
index 8f7fc394f63672954cc4444da0f2158a42470b2c..f0e5306eeb55e8e179da3abbe6c033045b6ad073 100644 (file)
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 {
        int ret = 0;
 
-       spin_lock(&tlb->mm->page_table_lock);
-       if (likely(pmd_trans_huge(*pmd))) {
-               if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(&tlb->mm->page_table_lock);
-                       wait_split_huge_page(vma->anon_vma,
-                                            pmd);
-               } else {
-                       struct page *page;
-                       pgtable_t pgtable;
-                       pgtable = get_pmd_huge_pte(tlb->mm);
-                       page = pmd_page(*pmd);
-                       pmd_clear(pmd);
-                       tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-                       page_remove_rmap(page);
-                       VM_BUG_ON(page_mapcount(page) < 0);
-                       add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-                       VM_BUG_ON(!PageHead(page));
-                       tlb->mm->nr_ptes--;
-                       spin_unlock(&tlb->mm->page_table_lock);
-                       tlb_remove_page(tlb, page);
-                       pte_free(tlb->mm, pgtable);
-                       ret = 1;
-               }
-       } else
+       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+               struct page *page;
+               pgtable_t pgtable;
+               pgtable = get_pmd_huge_pte(tlb->mm);
+               page = pmd_page(*pmd);
+               pmd_clear(pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+               page_remove_rmap(page);
+               VM_BUG_ON(page_mapcount(page) < 0);
+               add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+               VM_BUG_ON(!PageHead(page));
+               tlb->mm->nr_ptes--;
                spin_unlock(&tlb->mm->page_table_lock);
-
+               tlb_remove_page(tlb, page);
+               pte_free(tlb->mm, pgtable);
+               ret = 1;
+       }
        return ret;
 }
 
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        int ret = 0;
 
-       spin_lock(&vma->vm_mm->page_table_lock);
-       if (likely(pmd_trans_huge(*pmd))) {
-               ret = !pmd_trans_splitting(*pmd);
-               spin_unlock(&vma->vm_mm->page_table_lock);
-               if (unlikely(!ret))
-                       wait_split_huge_page(vma->anon_vma, pmd);
-               else {
-                       /*
-                        * All logical pages in the range are present
-                        * if backed by a huge page.
-                        */
-                       memset(vec, 1, (end - addr) >> PAGE_SHIFT);
-               }
-       } else
+       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+               /*
+                * All logical pages in the range are present
+                * if backed by a huge page.
+                */
                spin_unlock(&vma->vm_mm->page_table_lock);
+               memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+               ret = 1;
+       }
 
        return ret;
 }
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                goto out;
        }
 
-       spin_lock(&mm->page_table_lock);
-       if (likely(pmd_trans_huge(*old_pmd))) {
-               if (pmd_trans_splitting(*old_pmd)) {
-                       spin_unlock(&mm->page_table_lock);
-                       wait_split_huge_page(vma->anon_vma, old_pmd);
-                       ret = -1;
-               } else {
-                       pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
-                       VM_BUG_ON(!pmd_none(*new_pmd));
-                       set_pmd_at(mm, new_addr, new_pmd, pmd);
-                       spin_unlock(&mm->page_table_lock);
-                       ret = 1;
-               }
-       } else {
+       ret = __pmd_trans_huge_lock(old_pmd, vma);
+       if (ret == 1) {
+               pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+               VM_BUG_ON(!pmd_none(*new_pmd));
+               set_pmd_at(mm, new_addr, new_pmd, pmd);
                spin_unlock(&mm->page_table_lock);
        }
 out:
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        struct mm_struct *mm = vma->vm_mm;
        int ret = 0;
 
-       spin_lock(&mm->page_table_lock);
+       if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+               pmd_t entry;
+               entry = pmdp_get_and_clear(mm, addr, pmd);
+               entry = pmd_modify(entry, newprot);
+               set_pmd_at(mm, addr, pmd, entry);
+               spin_unlock(&vma->vm_mm->page_table_lock);
+               ret = 1;
+       }
+
+       return ret;
+}
+
+/*
+ * Returns 1 if a given pmd maps a stable (not under splitting) thp.
+ * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ *
+ * Note that if it returns 1, this routine returns without unlocking page
+ * table locks. So callers must unlock them.
+ */
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+       spin_lock(&vma->vm_mm->page_table_lock);
        if (likely(pmd_trans_huge(*pmd))) {
                if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(&mm->page_table_lock);
+                       spin_unlock(&vma->vm_mm->page_table_lock);
                        wait_split_huge_page(vma->anon_vma, pmd);
+                       return -1;
                } else {
-                       pmd_t entry;
-
-                       entry = pmdp_get_and_clear(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       set_pmd_at(mm, addr, pmd, entry);
-                       spin_unlock(&vma->vm_mm->page_table_lock);
-                       ret = 1;
+                       /* Thp mapped by 'pmd' is stable, so we can
+                        * handle it as it is. */
+                       return 1;
                }
-       } else
-               spin_unlock(&vma->vm_mm->page_table_lock);
-
-       return ret;
+       }
+       spin_unlock(&vma->vm_mm->page_table_lock);
+       return 0;
 }
 
 pmd_t *page_check_address_pmd(struct page *page,
index a876871f6be56d15b06d1da56a9f3dbbd61cda91..afa057a1d3fe6806eb19dcb73b4ae1fe327e47b6 100644 (file)
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+       bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+       spin_unlock(&spool->lock);
+
+       /* If no pages are used, and no other handles to the subpool
+        * remain, free the subpool. */
+       if (free)
+               kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+       struct hugepage_subpool *spool;
+
+       spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+       if (!spool)
+               return NULL;
+
+       spin_lock_init(&spool->lock);
+       spool->count = 1;
+       spool->max_hpages = nr_blocks;
+       spool->used_hpages = 0;
+
+       return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+       spin_lock(&spool->lock);
+       BUG_ON(!spool->count);
+       spool->count--;
+       unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+                                     long delta)
+{
+       int ret = 0;
+
+       if (!spool)
+               return 0;
+
+       spin_lock(&spool->lock);
+       if ((spool->used_hpages + delta) <= spool->max_hpages) {
+               spool->used_hpages += delta;
+       } else {
+               ret = -ENOMEM;
+       }
+       spin_unlock(&spool->lock);
+
+       return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+                                      long delta)
+{
+       if (!spool)
+               return;
+
+       spin_lock(&spool->lock);
+       spool->used_hpages -= delta;
+       /* If hugetlbfs_put_super couldn't free spool due to
+        * an outstanding quota reference, free it now. */
+       unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+       return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+       return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
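
The subpool introduced above replaces the old per-mapping quota accounting: hugepage_subpool_get_pages() fails with -ENOMEM once max_hpages would be exceeded, hugepage_subpool_put_pages() returns pages, and unlock_or_release_subpool() frees the structure once both the handle count and the page usage reach zero. A self-contained userspace model of that lifetime rule, with a mutex in place of the spinlock and all names chosen only for illustration:

/*
 * Userspace model of the subpool lifetime rule: the structure is freed only
 * when the last handle is dropped and no pages remain charged.  A pthread
 * mutex stands in for spool->lock; this is a sketch, not the kernel code.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct subpool {
	pthread_mutex_t lock;
	long count;		/* handle refcount */
	long max_hpages;	/* hard limit */
	long used_hpages;	/* pages currently charged */
};

/* Called with sp->lock held; may free sp. */
static void unlock_or_release(struct subpool *sp)
{
	bool free_it = (sp->count == 0) && (sp->used_hpages == 0);

	pthread_mutex_unlock(&sp->lock);
	if (free_it)
		free(sp);
}

static int subpool_get_pages(struct subpool *sp, long delta)
{
	int ret = 0;

	pthread_mutex_lock(&sp->lock);
	if (sp->used_hpages + delta <= sp->max_hpages)
		sp->used_hpages += delta;
	else
		ret = -ENOMEM;	/* over the subpool limit */
	pthread_mutex_unlock(&sp->lock);
	return ret;
}

static void subpool_put_pages(struct subpool *sp, long delta)
{
	pthread_mutex_lock(&sp->lock);
	sp->used_hpages -= delta;
	unlock_or_release(sp);	/* frees sp if it was already orphaned */
}

static void put_subpool(struct subpool *sp)
{
	pthread_mutex_lock(&sp->lock);
	sp->count--;
	unlock_or_release(sp);
}

int main(void)
{
	struct subpool *sp = calloc(1, sizeof(*sp));

	pthread_mutex_init(&sp->lock, NULL);
	sp->count = 1;
	sp->max_hpages = 2;

	printf("get 2 -> %d\n", subpool_get_pages(sp, 2));	/* 0 */
	printf("get 1 -> %d\n", subpool_get_pages(sp, 1));	/* -ENOMEM */
	subpool_put_pages(sp, 2);
	put_subpool(sp);	/* last handle and last page gone: freed */
	return 0;
}
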
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve)
 {
-       struct page *page = NULL;
+       struct page *page;
        struct mempolicy *mpol;
        nodemask_t *nodemask;
        struct zonelist *zonelist;
        struct zone *zone;
        struct zoneref *z;
+       unsigned int cpuset_mems_cookie;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
        zonelist = huge_zonelist(vma, address,
                                        htlb_alloc_mask, &mpol, &nodemask);
        /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                        }
                }
        }
-err:
+
        mpol_cond_put(mpol);
-       put_mems_allowed();
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
        return page;
+
+err:
+       mpol_cond_put(mpol);
+       return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
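
dequeue_huge_page_vma() now uses the cpuset mems cookie the way a seqlock reader would: sample a generation count, attempt the allocation, and retry only when the allocation failed and put_mems_allowed() reports that the allowed node set changed underneath. A sketch of that retry structure, with a C11 atomic counter standing in for the cpuset generation; get_cookie(), put_cookie() and try_alloc() are illustrative stand-ins, not kernel interfaces:

/*
 * Userspace sketch of the get_mems_allowed()/put_mems_allowed() cookie
 * pattern.  Only the retry structure mirrors the hunk above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint mems_seq;	/* bumped whenever the allowed set changes */

static unsigned int get_cookie(void)
{
	return atomic_load(&mems_seq);
}

/* Returns true if nothing changed since the cookie was taken. */
static bool put_cookie(unsigned int cookie)
{
	return atomic_load(&mems_seq) == cookie;
}

static void *try_alloc(void)
{
	return NULL;	/* pretend the allowed nodes had no free pages */
}

int main(void)
{
	void *page;
	unsigned int cookie;
	int attempts = 0;

retry:
	cookie = get_cookie();
	page = try_alloc();
	attempts++;
	/* Retry only when the allocation failed *and* the set changed. */
	if (!put_cookie(cookie) && !page && attempts < 3)
		goto retry;

	printf("attempts=%d page=%p\n", attempts, page);
	return 0;
}
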
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
         */
        struct hstate *h = page_hstate(page);
        int nid = page_to_nid(page);
-       struct address_space *mapping;
+       struct hugepage_subpool *spool =
+               (struct hugepage_subpool *)page_private(page);
 
-       mapping = (struct address_space *) page_private(page);
        set_page_private(page, 0);
        page->mapping = NULL;
        BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
                enqueue_huge_page(h, page);
        }
        spin_unlock(&hugetlb_lock);
-       if (mapping)
-               hugetlb_put_quota(mapping, 1);
+       hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
        struct page *page, *tmp;
        int ret, i;
        int needed, allocated;
+       bool alloc_ok = true;
 
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
-               if (!page)
-                       /*
-                        * We were not able to allocate enough pages to
-                        * satisfy the entire reservation so we free what
-                        * we've allocated so far.
-                        */
-                       goto free;
-
+               if (!page) {
+                       alloc_ok = false;
+                       break;
+               }
                list_add(&page->lru, &surplus_list);
        }
-       allocated += needed;
+       allocated += i;
 
        /*
         * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
        spin_lock(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) -
                        (h->free_huge_pages + allocated);
-       if (needed > 0)
-               goto retry;
-
+       if (needed > 0) {
+               if (alloc_ok)
+                       goto retry;
+               /*
+                * We were not able to allocate enough pages to
+                * satisfy the entire reservation so we free what
+                * we've allocated so far.
+                */
+               goto free;
+       }
        /*
         * The surplus_list now contains _at_least_ the number of extra pages
         * needed to accommodate the reservation.  Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
                VM_BUG_ON(page_count(page));
                enqueue_huge_page(h, page);
        }
+free:
        spin_unlock(&hugetlb_lock);
 
        /* Free unnecessary surplus pages to the buddy allocator */
-free:
        if (!list_empty(&surplus_list)) {
                list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                        list_del(&page->lru);
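
gather_surplus_pages() keeps its two-phase shape: drop hugetlb_lock, allocate a batch of surplus pages, retake the lock and recompute how many are still needed; the change above records a partial-allocation failure in alloc_ok and routes it through the common free path instead of bailing out early. A rough userspace model of that loop, where alloc_one(), the counters and the return convention are stand-ins invented for the sketch:

/*
 * "Allocate outside the lock, recount under the lock" loop, with the
 * alloc_ok fallout from this hunk.  A -1 return means the caller should
 * give back whatever was allocated.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hugetlb_lock = PTHREAD_MUTEX_INITIALIZER;
static int resv_pages, free_pages;

static bool alloc_one(void)
{
	static int budget = 3;	/* pretend only 3 pages can be allocated */
	return budget-- > 0;
}

static int gather_surplus(int delta)
{
	int needed, allocated = 0, i;
	bool alloc_ok = true;

	pthread_mutex_lock(&hugetlb_lock);
	needed = (resv_pages + delta) - free_pages;

	while (needed > 0 && alloc_ok) {
		pthread_mutex_unlock(&hugetlb_lock);
		for (i = 0; i < needed; i++) {
			if (!alloc_one()) {
				alloc_ok = false;	/* keep what we got */
				break;
			}
		}
		allocated += i;

		pthread_mutex_lock(&hugetlb_lock);
		/* someone may have freed pages meanwhile; recount */
		needed = (resv_pages + delta) - (free_pages + allocated);
	}
	pthread_mutex_unlock(&hugetlb_lock);
	return (needed > 0) ? -1 : 0;
}

int main(void)
{
	printf("gather_surplus(5) = %d\n", gather_surplus(5));
	return 0;
}
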
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur.  Where any new reservation would be required the
+ * reservation change is prepared, but not committed.  Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation.  No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
+       struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct page *page;
-       struct address_space *mapping = vma->vm_file->f_mapping;
-       struct inode *inode = mapping->host;
        long chg;
 
        /*
-        * Processes that did not create the mapping will have no reserves and
-        * will not have accounted against quota. Check that the quota can be
-        * made before satisfying the allocation
-        * MAP_NORESERVE mappings may also need pages and quota allocated
-        * if no reserve mapping overlaps.
+        * Processes that did not create the mapping will have no
+        * reserves and will not have accounted against the subpool
+        * limit. Check that the subpool limit can be made before
+        * satisfying the allocation.  MAP_NORESERVE mappings may also
+        * need pages and subpool limit allocated if no reserve
+        * mapping overlaps.
         */
        chg = vma_needs_reservation(h, vma, addr);
        if (chg < 0)
                return ERR_PTR(-VM_FAULT_OOM);
        if (chg)
-               if (hugetlb_get_quota(inode->i_mapping, chg))
+               if (hugepage_subpool_get_pages(spool, chg))
                        return ERR_PTR(-VM_FAULT_SIGBUS);
 
        spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        if (!page) {
                page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
                if (!page) {
-                       hugetlb_put_quota(inode->i_mapping, chg);
+                       hugepage_subpool_put_pages(spool, chg);
                        return ERR_PTR(-VM_FAULT_SIGBUS);
                }
        }
 
-       set_page_private(page, (unsigned long) mapping);
+       set_page_private(page, (unsigned long)spool);
 
        vma_commit_reservation(h, vma, addr);
 
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
        struct hstate *h = hstate_vma(vma);
        struct resv_map *reservations = vma_resv_map(vma);
+       struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long reserve;
        unsigned long start;
        unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
                if (reserve) {
                        hugetlb_acct_memory(h, -reserve);
-                       hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+                       hugepage_subpool_put_pages(spool, reserve);
                }
        }
 }
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                if (pte_dirty(pte))
                        set_page_dirty(page);
                list_add(&page->lru, &page_list);
+
+               /* Bail out after unmapping reference page if supplied */
+               if (ref_page)
+                       break;
        }
        flush_tlb_range(vma, start, end);
        spin_unlock(&mm->page_table_lock);
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        address = address & huge_page_mask(h);
        pgoff = vma_hugecache_offset(h, vma, address);
-       mapping = (struct address_space *)page_private(page);
+       mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
        /*
         * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
        long ret, chg;
        struct hstate *h = hstate_inode(inode);
+       struct hugepage_subpool *spool = subpool_inode(inode);
 
        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
-        * and filesystem quota without using reserves
+        * without using reserves
         */
        if (vm_flags & VM_NORESERVE)
                return 0;
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
        if (chg < 0)
                return chg;
 
-       /* There must be enough filesystem quota for the mapping */
-       if (hugetlb_get_quota(inode->i_mapping, chg))
+       /* There must be enough pages in the subpool for the mapping */
+       if (hugepage_subpool_get_pages(spool, chg))
                return -ENOSPC;
 
        /*
         * Check enough hugepages are available for the reservation.
-        * Hand back the quota if there are not
+        * Hand the pages back to the subpool if there are not
         */
        ret = hugetlb_acct_memory(h, chg);
        if (ret < 0) {
-               hugetlb_put_quota(inode->i_mapping, chg);
+               hugepage_subpool_put_pages(spool, chg);
                return ret;
        }
 
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
        struct hstate *h = hstate_inode(inode);
        long chg = region_truncate(&inode->i_mapping->private_list, offset);
+       struct hugepage_subpool *spool = subpool_inode(inode);
 
        spin_lock(&inode->i_lock);
        inode->i_blocks -= (blocks_per_huge_page(h) * freed);
        spin_unlock(&inode->i_lock);
 
-       hugetlb_put_quota(inode->i_mapping, (chg - freed));
+       hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -(chg - freed));
 }
 
index a6d3fb7e6c10576eb6f075a61ebf43535da2ce53..47c885368890741407eb45e8699acb12fc773aca 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 }
 
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+               unsigned long addr)
+{
+       struct vm_area_struct *vma;
+       if (ksm_test_exit(mm))
+               return NULL;
+       vma = find_vma(mm, addr);
+       if (!vma || vma->vm_start > addr)
+               return NULL;
+       if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+               return NULL;
+       return vma;
+}
+
 static void break_cow(struct rmap_item *rmap_item)
 {
        struct mm_struct *mm = rmap_item->mm;
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
        put_anon_vma(rmap_item->anon_vma);
 
        down_read(&mm->mmap_sem);
-       if (ksm_test_exit(mm))
-               goto out;
-       vma = find_vma(mm, addr);
-       if (!vma || vma->vm_start > addr)
-               goto out;
-       if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
-               goto out;
-       break_ksm(vma, addr);
-out:
+       vma = find_mergeable_vma(mm, addr);
+       if (vma)
+               break_ksm(vma, addr);
        up_read(&mm->mmap_sem);
 }
 
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        struct page *page;
 
        down_read(&mm->mmap_sem);
-       if (ksm_test_exit(mm))
-               goto out;
-       vma = find_vma(mm, addr);
-       if (!vma || vma->vm_start > addr)
-               goto out;
-       if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+       vma = find_mergeable_vma(mm, addr);
+       if (!vma)
                goto out;
 
        page = follow_page(vma, addr, FOLL_GET);
index 26c6f4ec20f43e626f0564776f1b38be8a54cd41..b2ee6df0e9bb31eebd3b2f1528cbbcccb2b9c43e 100644 (file)
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
        MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
        MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
-       MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
        MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
  */
 struct mem_cgroup_per_zone {
        struct lruvec           lruvec;
-       unsigned long           count[NR_LRU_LISTS];
+       unsigned long           lru_size[NR_LRU_LISTS];
 
        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
        unsigned long long      usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                    on_tree;
-       struct mem_cgroup       *mem;           /* Back pointer, we cannot */
+       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
-/* Macro for accessing counter */
-#define MEM_CGROUP_ZSTAT(mz, idx)      ((mz)->count[(idx)])
 
 struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -299,6 +296,12 @@ struct mem_cgroup {
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long   move_charge_at_immigrate;
+       /*
+        * set > 0 if pages under this cgroup are being moved to another cgroup.
+        */
+       atomic_t        moving_account;
+       /* taken only while moving_account > 0 */
+       spinlock_t      move_lock;
        /*
         * percpu counter.
         */
@@ -612,9 +615,9 @@ retry:
         * we will to add it back at the end of reclaim to its correct
         * position in the tree.
         */
-       __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
-       if (!res_counter_soft_limit_excess(&mz->mem->res) ||
-               !css_tryget(&mz->mem->css))
+       __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+       if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+               !css_tryget(&mz->memcg->css))
                goto retry;
 done:
        return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
-                                        bool file, int nr_pages)
+                                        bool anon, int nr_pages)
 {
        preempt_disable();
 
-       if (file)
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+       /*
+        * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+        * counted as CACHE even if it's on ANON LRU.
+        */
+       if (anon)
+               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
                                nr_pages);
        else
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
 
        /* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
                        unsigned int lru_mask)
 {
        struct mem_cgroup_per_zone *mz;
-       enum lru_list l;
+       enum lru_list lru;
        unsigned long ret = 0;
 
        mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
-       for_each_lru(l) {
-               if (BIT(l) & lru_mask)
-                       ret += MEM_CGROUP_ZSTAT(mz, l);
+       for_each_lru(lru) {
+               if (BIT(lru) & lru_mask)
+                       ret += mz->lru_size[lru];
        }
        return ret;
 }
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 
        mz = page_cgroup_zoneinfo(memcg, page);
        /* compound_order() is stabilized through lru_lock */
-       MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+       mz->lru_size[lru] += 1 << compound_order(page);
        return &mz->lruvec;
 }
 
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
        VM_BUG_ON(!memcg);
        mz = page_cgroup_zoneinfo(memcg, page);
        /* huge page split is done under lru_lock. so, we have no races. */
-       VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
-       MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
+       VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
+       mz->lru_size[lru] -= 1 << compound_order(page);
 }
 
 void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
        return memcg->swappiness;
 }
 
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
-       int cpu;
+/*
+ * memcg->moving_account is used for checking the possibility that some thread
+ * is calling move_account(). When a thread on CPU-A starts moving pages under
+ * a memcg, other threads should check memcg->moving_account under
+ * rcu_read_lock(), like this:
+ *
+ *         CPU-A                                    CPU-B
+ *                                              rcu_read_lock()
+ *         memcg->moving_account+1              if (memcg->moving_account)
+ *                                                   take heavy locks.
+ *         synchronize_rcu()                    update something.
+ *                                              rcu_read_unlock()
+ *         start move here.
+ */
 
-       get_online_cpus();
-       spin_lock(&memcg->pcp_counter_lock);
-       for_each_online_cpu(cpu)
-               per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
-       memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
-       spin_unlock(&memcg->pcp_counter_lock);
-       put_online_cpus();
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
 
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
+{
+       atomic_inc(&memcg_moving);
+       atomic_inc(&memcg->moving_account);
        synchronize_rcu();
 }
 
 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 {
-       int cpu;
-
-       if (!memcg)
-               return;
-       get_online_cpus();
-       spin_lock(&memcg->pcp_counter_lock);
-       for_each_online_cpu(cpu)
-               per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
-       memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
-       spin_unlock(&memcg->pcp_counter_lock);
-       put_online_cpus();
+       /*
+        * Now, mem_cgroup_clear_mc() may call this function with NULL.
+        * We check for NULL in the callee rather than in the caller.
+        */
+       if (memcg) {
+               atomic_dec(&memcg_moving);
+               atomic_dec(&memcg->moving_account);
+       }
 }
+
 /*
  * 2 routines for checking "mem" is under move_account() or not.
  *
- * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
- *                       for avoiding race in accounting. If true,
+ * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
+ *                       is used for avoiding races in accounting.  If true,
  *                       pc->mem_cgroup may be overwritten.
  *
  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 *                       waiting at high-memory pressure caused by "move".
  */
 
-static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
 {
        VM_BUG_ON(!rcu_read_lock_held());
-       return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+       return atomic_read(&memcg->moving_account) > 0;
 }
 
 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
        return false;
 }
 
+/*
+ * Take this lock when
+ * - a code path modifies a page's memcg while it is USED.
+ * - a code path modifies page state accounting in a memcg.
+ * See mem_cgroup_stolen(), too.
+ */
+static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
+                                 unsigned long *flags)
+{
+       spin_lock_irqsave(&memcg->move_lock, *flags);
+}
+
+static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
+                               unsigned long *flags)
+{
+       spin_unlock_irqrestore(&memcg->move_lock, *flags);
+}
+
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
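
mem_cgroup_start_move()/mem_cgroup_end_move() above replace the per-cpu MEM_CGROUP_ON_MOVE counters with a plain atomic flag plus synchronize_rcu(); statistics updaters only fall back to the new per-memcg move_lock while moving_account is raised. A userspace model of that flag-gated locking, with a mutex for move_lock and the mover simply holding the lock for the whole move instead of using RCU (a deliberate simplification, not the kernel scheme; all names are illustrative):

/*
 * Flag-gated locking: updaters skip the heavy per-group lock unless a
 * charge move is in flight.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct group {
	atomic_int moving_account;	/* >0 while pages are being moved */
	pthread_mutex_t move_lock;	/* taken only while moving_account > 0 */
	long file_mapped;
};

static void update_stat(struct group *g, int val)
{
	int locked = 0;

	if (atomic_load(&g->moving_account) > 0) {	/* slow path */
		pthread_mutex_lock(&g->move_lock);
		locked = 1;
	}
	g->file_mapped += val;		/* the accounted update */
	if (locked)
		pthread_mutex_unlock(&g->move_lock);
}

static void move_account(struct group *from)
{
	atomic_fetch_add(&from->moving_account, 1);
	pthread_mutex_lock(&from->move_lock);
	/* ... re-point pages at the destination group ... */
	pthread_mutex_unlock(&from->move_lock);
	atomic_fetch_sub(&from->moving_account, 1);
}

int main(void)
{
	struct group g = { .file_mapped = 0 };

	atomic_init(&g.moving_account, 0);
	pthread_mutex_init(&g.move_lock, NULL);

	update_stat(&g, 1);	/* fast path, no lock taken */
	move_account(&g);
	update_stat(&g, -1);	/* would take the lock if a move raced */
	printf("file_mapped=%ld\n", g.file_mapped);
	return 0;
}
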
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        if (!memcg || !p)
                return;
 
-
        rcu_read_lock();
 
        mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
-       struct mem_cgroup *mem;
+       struct mem_cgroup *memcg;
        wait_queue_t    wait;
 };
 
 static int memcg_oom_wake_function(wait_queue_t *wait,
        unsigned mode, int sync, void *arg)
 {
-       struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
-                         *oom_wait_memcg;
+       struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+       struct mem_cgroup *oom_wait_memcg;
        struct oom_wait_info *oom_wait_info;
 
        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
-       oom_wait_memcg = oom_wait_info->mem;
+       oom_wait_memcg = oom_wait_info->memcg;
 
        /*
-        * Both of oom_wait_info->mem and wake_mem are stable under us.
+        * Both of oom_wait_info->memcg and wake_memcg are stable under us.
         * Then we can use css_is_ancestor without taking care of RCU.
         */
        if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
        struct oom_wait_info owait;
        bool locked, need_to_kill;
 
-       owait.mem = memcg;
+       owait.memcg = memcg;
        owait.wait.flags = 0;
        owait.wait.func = memcg_oom_wake_function;
        owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
 
        if (need_to_kill) {
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, mask);
+               mem_cgroup_out_of_memory(memcg, mask, order);
        } else {
                schedule();
                finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
  * by flags.
  *
  * Considering "move", this is an only case we see a race. To make the race
- * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
- * possibility of race condition. If there is, we take a lock.
+ * small, we check memcg->moving_account and detect the possibility of a race.
+ * If there is, we take a lock.
  */
 
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+                               bool *locked, unsigned long *flags)
+{
+       struct mem_cgroup *memcg;
+       struct page_cgroup *pc;
+
+       pc = lookup_page_cgroup(page);
+again:
+       memcg = pc->mem_cgroup;
+       if (unlikely(!memcg || !PageCgroupUsed(pc)))
+               return;
+       /*
+        * need to take move_lock_mem_cgroup(). Because we already hold
+        * need to take move_lock_page_cgroup(). Because we already hold
+        * rcu_read_lock(), any calls to move_account will be delayed until
+        * rcu_read_unlock() if mem_cgroup_stolen() == true.
+        */
+       if (!mem_cgroup_stolen(memcg))
+               return;
+
+       move_lock_mem_cgroup(memcg, flags);
+       if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+               move_unlock_mem_cgroup(memcg, flags);
+               goto again;
+       }
+       *locked = true;
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+       struct page_cgroup *pc = lookup_page_cgroup(page);
+
+       /*
+        * It's guaranteed that pc->mem_cgroup never changes while the
+        * lock is held, because any routine that modifies pc->mem_cgroup
+        * must take move_lock_mem_cgroup().
+        */
+       move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
 void mem_cgroup_update_page_stat(struct page *page,
                                 enum mem_cgroup_page_stat_item idx, int val)
 {
        struct mem_cgroup *memcg;
        struct page_cgroup *pc = lookup_page_cgroup(page);
-       bool need_unlock = false;
        unsigned long uninitialized_var(flags);
 
        if (mem_cgroup_disabled())
                return;
 
-       rcu_read_lock();
        memcg = pc->mem_cgroup;
        if (unlikely(!memcg || !PageCgroupUsed(pc)))
-               goto out;
-       /* pc->mem_cgroup is unstable ? */
-       if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
-               /* take a lock against to access pc->mem_cgroup */
-               move_lock_page_cgroup(pc, &flags);
-               need_unlock = true;
-               memcg = pc->mem_cgroup;
-               if (!memcg || !PageCgroupUsed(pc))
-                       goto out;
-       }
+               return;
 
        switch (idx) {
        case MEMCG_NR_FILE_MAPPED:
-               if (val > 0)
-                       SetPageCgroupFileMapped(pc);
-               else if (!page_mapped(page))
-                       ClearPageCgroupFileMapped(pc);
                idx = MEM_CGROUP_STAT_FILE_MAPPED;
                break;
        default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
        }
 
        this_cpu_add(memcg->stat->count[idx], val);
-
-out:
-       if (unlikely(need_unlock))
-               move_unlock_page_cgroup(pc, &flags);
-       rcu_read_unlock();
-       return;
 }
-EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
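
__mem_cgroup_begin_update_page_stat() above is the check -> lock -> re-check idiom: read pc->mem_cgroup, take that memcg's move_lock only if a move might be in flight, then verify the page still belongs to the memcg that was locked and start over if it was moved in the meantime. A small sketch of the same owner-revalidation loop, using a pthread mutex and invented names (the fast path that skips locking entirely is omitted here):

/*
 * Lock the owner you observed, then verify the object still belongs to
 * it; otherwise restart with the new owner.  Returns with o->lock held.
 */
#include <pthread.h>
#include <stdio.h>

struct owner {
	pthread_mutex_t lock;
	const char *name;
};

struct object {
	struct owner *owner;	/* may be re-pointed by a mover */
};

static struct owner *lock_stable_owner(struct object *obj)
{
	struct owner *o;

again:
	o = obj->owner;
	pthread_mutex_lock(&o->lock);
	if (obj->owner != o) {		/* moved while we were locking */
		pthread_mutex_unlock(&o->lock);
		goto again;
	}
	return o;
}

int main(void)
{
	static struct owner a = { PTHREAD_MUTEX_INITIALIZER, "A" };
	struct object obj = { .owner = &a };
	struct owner *o = lock_stable_owner(&obj);

	printf("locked owner %s\n", o->name);
	pthread_mutex_unlock(&o->lock);
	return 0;
}
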
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
                per_cpu(memcg->stat->events[i], cpu) = 0;
                memcg->nocpu_base.events[i] += x;
        }
-       /* need to clear ON_MOVE value, works as a kind of lock. */
-       per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
-{
-       int idx = MEM_CGROUP_ON_MOVE;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
        spin_unlock(&memcg->pcp_counter_lock);
 }
 
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
        struct memcg_stock_pcp *stock;
        struct mem_cgroup *iter;
 
-       if ((action == CPU_ONLINE)) {
-               for_each_mem_cgroup(iter)
-                       synchronize_mem_cgroup_on_move(iter, cpu);
+       if (action == CPU_ONLINE)
                return NOTIFY_OK;
-       }
 
        if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        if (!oom_check)
                return CHARGE_NOMEM;
        /* check OOM */
-       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
                return CHARGE_OOM_DIE;
 
        return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 {
        struct zone *uninitialized_var(zone);
        bool was_on_lru = false;
+       bool anon;
 
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
         * See mem_cgroup_add_lru_list(), etc.
         */
        smp_wmb();
-       switch (ctype) {
-       case MEM_CGROUP_CHARGE_TYPE_CACHE:
-       case MEM_CGROUP_CHARGE_TYPE_SHMEM:
-               SetPageCgroupCache(pc);
-               SetPageCgroupUsed(pc);
-               break;
-       case MEM_CGROUP_CHARGE_TYPE_MAPPED:
-               ClearPageCgroupCache(pc);
-               SetPageCgroupUsed(pc);
-               break;
-       default:
-               break;
-       }
+       SetPageCgroupUsed(pc);
 
        if (lrucare) {
                if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
                spin_unlock_irq(&zone->lru_lock);
        }
 
-       mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
+       if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+               anon = true;
+       else
+               anon = false;
+
+       mem_cgroup_charge_statistics(memcg, anon, nr_pages);
        unlock_page_cgroup(pc);
 
        /*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
-                       (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        unsigned long flags;
        int ret;
+       bool anon = PageAnon(page);
 
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
        if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
                goto unlock;
 
-       move_lock_page_cgroup(pc, &flags);
+       move_lock_mem_cgroup(from, &flags);
 
-       if (PageCgroupFileMapped(pc)) {
+       if (!anon && page_mapped(page)) {
                /* Update mapped_file data for mem_cgroup */
                preempt_disable();
                __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                preempt_enable();
        }
-       mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
+       mem_cgroup_charge_statistics(from, anon, -nr_pages);
        if (uncharge)
                /* This is not "cancel", but cancel_charge does all we need. */
                __mem_cgroup_cancel_charge(from, nr_pages);
 
        /* caller should have done css_get */
        pc->mem_cgroup = to;
-       mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
+       mem_cgroup_charge_statistics(to, anon, nr_pages);
        /*
         * We charges against "to" which may not have any tasks. Then, "to"
         * can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
         * guaranteed that "to" is never removed. So, we don't check rmdir
         * status here.
         */
-       move_unlock_page_cgroup(pc, &flags);
+       move_unlock_mem_cgroup(from, &flags);
        ret = 0;
 unlock:
        unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
                res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
        if (unlikely(batch->memcg != memcg))
                memcg_oom_recover(memcg);
-       return;
 }
 
 /*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        struct mem_cgroup *memcg = NULL;
        unsigned int nr_pages = 1;
        struct page_cgroup *pc;
+       bool anon;
 
        if (mem_cgroup_disabled())
                return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        if (!PageCgroupUsed(pc))
                goto unlock_out;
 
+       anon = PageAnon(page);
+
        switch (ctype) {
        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+               /*
+                * Generally PageAnon tells if it's the anon statistics to be
+                * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+                * used before the page has actually been marked PageAnon.
+                */
+               anon = true;
+               /* fallthrough */
        case MEM_CGROUP_CHARGE_TYPE_DROP:
                /* See mem_cgroup_prepare_migration() */
                if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
+       mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
 
        ClearPageCgroupUsed(pc);
        /*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 {
        struct page *used, *unused;
        struct page_cgroup *pc;
+       bool anon;
 
        if (!memcg)
                return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
        lock_page_cgroup(pc);
        ClearPageCgroupMigration(pc);
        unlock_page_cgroup(pc);
-
-       __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
+       anon = PageAnon(used);
+       __mem_cgroup_uncharge_common(unused,
+               anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
+                    : MEM_CGROUP_CHARGE_TYPE_CACHE);
 
        /*
         * If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
         * and USED bit check in mem_cgroup_uncharge_page() will do enough
         * check. (see prepare_charge() also)
         */
-       if (PageAnon(used))
+       if (anon)
                mem_cgroup_uncharge_page(used);
        /*
         * At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
        /* fix accounting on old pages */
        lock_page_cgroup(pc);
        memcg = pc->mem_cgroup;
-       mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+       mem_cgroup_charge_statistics(memcg, false, -1);
        ClearPageCgroupUsed(pc);
        unlock_page_cgroup(pc);
 
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                        break;
 
                nr_scanned = 0;
-               reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
                                                    gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                next_mz =
                                __mem_cgroup_largest_soft_limit_node(mctz);
                                if (next_mz == mz)
-                                       css_put(&next_mz->mem->css);
+                                       css_put(&next_mz->memcg->css);
                                else /* next_mz == NULL or other memcg */
                                        break;
                        } while (1);
                }
-               __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
-               excess = res_counter_soft_limit_excess(&mz->mem->res);
+               __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+               excess = res_counter_soft_limit_excess(&mz->memcg->res);
                /*
                 * One school of thought says that we should not add
                 * back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                 * term TODO.
                 */
                /* If excess == 0, no tree ops */
-               __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+               __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
                spin_unlock(&mctz->lock);
-               css_put(&mz->mem->css);
+               css_put(&mz->memcg->css);
                loop++;
                /*
                 * Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                        break;
        } while (!nr_reclaimed);
        if (next_mz)
-               css_put(&next_mz->mem->css);
+               css_put(&next_mz->memcg->css);
        return nr_reclaimed;
 }
 
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
        mz = mem_cgroup_zoneinfo(memcg, node, zid);
        list = &mz->lruvec.lists[lru];
 
-       loop = MEM_CGROUP_ZSTAT(mz, lru);
+       loop = mz->lru_size[lru];
        /* give some margin against EBUSY etc...*/
        loop += 256;
        busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
                mem_cgroup_start_move(memcg);
                for_each_node_state(node, N_HIGH_MEMORY) {
                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
-                               enum lru_list l;
-                               for_each_lru(l) {
+                               enum lru_list lru;
+                               for_each_lru(lru) {
                                        ret = mem_cgroup_force_empty_list(memcg,
-                                                       node, zid, l);
+                                                       node, zid, lru);
                                        if (ret)
                                                break;
                                }
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
                break;
        default:
                BUG();
-               break;
        }
        return val;
 }
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
 out:
        *mem_limit = min_limit;
        *memsw_limit = min_memsw_limit;
-       return;
 }
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
        unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
        unsigned long node_nr;
        struct cgroup *cont = m->private;
-       struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-       total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
+       total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
        seq_printf(m, "total=%lu", total_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
+               node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
+       file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
        seq_printf(m, "file=%lu", file_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+               node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
                                LRU_ALL_FILE);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
+       anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
        seq_printf(m, "anon=%lu", anon_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+               node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
                                LRU_ALL_ANON);
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
        seq_putc(m, '\n');
 
-       unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
+       unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
        seq_printf(m, "unevictable=%lu", unevictable_nr);
        for_each_node_state(nid, N_HIGH_MEMORY) {
-               node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+               node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
                                BIT(LRU_UNEVICTABLE));
                seq_printf(m, " N%d=%lu", nid, node_nr);
        }
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
                                 struct cgroup_map_cb *cb)
 {
-       struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        struct mcs_total_stat mystat;
        int i;
 
        memset(&mystat, 0, sizeof(mystat));
-       mem_cgroup_get_local_stat(mem_cont, &mystat);
+       mem_cgroup_get_local_stat(memcg, &mystat);
 
 
        for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
        /* Hierarchical information */
        {
                unsigned long long limit, memsw_limit;
-               memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+               memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
                cb->fill(cb, "hierarchical_memory_limit", limit);
                if (do_swap_account)
                        cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
        }
 
        memset(&mystat, 0, sizeof(mystat));
-       mem_cgroup_get_total_stat(mem_cont, &mystat);
+       mem_cgroup_get_total_stat(memcg, &mystat);
        for (i = 0; i < NR_MCS_STAT; i++) {
                if (i == MCS_SWAP && !do_swap_account)
                        continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 
                for_each_online_node(nid)
                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                               mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+                               mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
                                recent_rotated[0] +=
                                        mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
        else
                BUG();
 
-       /*
-        * Something went wrong if we trying to unregister a threshold
-        * if we don't have thresholds
-        */
-       BUG_ON(!thresholds);
-
        if (!thresholds->primary)
                goto unlock;
 
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup_per_zone *mz;
-       enum lru_list l;
+       enum lru_list lru;
        int zone, tmp = node;
        /*
         * This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
-               for_each_lru(l)
-                       INIT_LIST_HEAD(&mz->lruvec.lists[l]);
+               for_each_lru(lru)
+                       INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
                mz->usage_in_excess = 0;
                mz->on_tree = false;
-               mz->mem = memcg;
+               mz->memcg = memcg;
        }
        memcg->info.nodeinfo[node] = pn;
        return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
-       struct mem_cgroup *mem;
+       struct mem_cgroup *memcg;
        int size = sizeof(struct mem_cgroup);
 
        /* Can be very big if MAX_NUMNODES is very big */
        if (size < PAGE_SIZE)
-               mem = kzalloc(size, GFP_KERNEL);
+               memcg = kzalloc(size, GFP_KERNEL);
        else
-               mem = vzalloc(size);
+               memcg = vzalloc(size);
 
-       if (!mem)
+       if (!memcg)
                return NULL;
 
-       mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-       if (!mem->stat)
+       memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+       if (!memcg->stat)
                goto out_free;
-       spin_lock_init(&mem->pcp_counter_lock);
-       return mem;
+       spin_lock_init(&memcg->pcp_counter_lock);
+       return memcg;
 
 out_free:
        if (size < PAGE_SIZE)
-               kfree(mem);
+               kfree(memcg);
        else
-               vfree(mem);
+               vfree(memcg);
        return NULL;
 }
 
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
        atomic_set(&memcg->refcnt, 1);
        memcg->move_charge_at_immigrate = 0;
        mutex_init(&memcg->thresholds_lock);
+       spin_lock_init(&memcg->move_lock);
        return &memcg->css;
 free_out:
        __mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
 }
 
 /**
- * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * get_mctgt_type - get target type of moving charge
  * @vma: the vma the pte to be checked belongs
  * @addr: the address corresponding to the pte to be checked
  * @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
 };
 
 enum mc_target_type {
-       MC_TARGET_NONE, /* not used */
+       MC_TARGET_NONE = 0,
        MC_TARGET_PAGE,
        MC_TARGET_SWAP,
 };
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
        return page;
 }
 
-static int is_target_pte_for_mc(struct vm_area_struct *vma,
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
 {
        struct page *page = NULL;
        struct page_cgroup *pc;
-       int ret = 0;
+       enum mc_target_type ret = MC_TARGET_NONE;
        swp_entry_t ent = { .val = 0 };
 
        if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
                page = mc_handle_file_pte(vma, addr, ptent, &ent);
 
        if (!page && !ent.val)
-               return 0;
+               return ret;
        if (page) {
                pc = lookup_page_cgroup(page);
                /*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
        return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+               unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+       struct page *page = NULL;
+       struct page_cgroup *pc;
+       enum mc_target_type ret = MC_TARGET_NONE;
+
+       page = pmd_page(pmd);
+       VM_BUG_ON(!page || !PageHead(page));
+       if (!move_anon())
+               return ret;
+       pc = lookup_page_cgroup(page);
+       if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+               ret = MC_TARGET_PAGE;
+               if (target) {
+                       get_page(page);
+                       target->page = page;
+               }
+       }
+       return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+               unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+       return MC_TARGET_NONE;
+}
+#endif
+
 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
-       split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+               if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+                       mc.precharge += HPAGE_PMD_NR;
+               spin_unlock(&vma->vm_mm->page_table_lock);
+               return 0;
+       }
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
-               if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+               if (get_mctgt_type(vma, addr, *pte, NULL))
                        mc.precharge++; /* increment precharge temporarily */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        struct vm_area_struct *vma = walk->private;
        pte_t *pte;
        spinlock_t *ptl;
+       enum mc_target_type target_type;
+       union mc_target target;
+       struct page *page;
+       struct page_cgroup *pc;
+
+       /*
+        * We don't take compound_lock() here but no race with splitting thp
+        * happens because:
+        *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+        *    under splitting, which means there's no concurrent thp split,
+        *  - if another thread runs into split_huge_page() just after we
+        *    entered this if-block, the thread must wait for page table lock
+        *    to be unlocked in __split_huge_page_splitting(), where the main
+        *    part of thp split is not executed yet.
+        */
+       if (pmd_trans_huge_lock(pmd, vma) == 1) {
+               if (!mc.precharge) {
+                       spin_unlock(&vma->vm_mm->page_table_lock);
+                       return 0;
+               }
+               target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+               if (target_type == MC_TARGET_PAGE) {
+                       page = target.page;
+                       if (!isolate_lru_page(page)) {
+                               pc = lookup_page_cgroup(page);
+                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+                                                            pc, mc.from, mc.to,
+                                                            false)) {
+                                       mc.precharge -= HPAGE_PMD_NR;
+                                       mc.moved_charge += HPAGE_PMD_NR;
+                               }
+                               putback_lru_page(page);
+                       }
+                       put_page(page);
+               }
+               spin_unlock(&vma->vm_mm->page_table_lock);
+               return 0;
+       }
 
-       split_huge_page_pmd(walk->mm, pmd);
 retry:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                pte_t ptent = *(pte++);
-               union mc_target target;
-               int type;
-               struct page *page;
-               struct page_cgroup *pc;
                swp_entry_t ent;
 
                if (!mc.precharge)
                        break;
 
-               type = is_target_pte_for_mc(vma, addr, ptent, &target);
-               switch (type) {
+               switch (get_mctgt_type(vma, addr, ptent, &target)) {
                case MC_TARGET_PAGE:
                        page = target.page;
                        if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
                                mc.moved_charge++;
                        }
                        putback_lru_page(page);
-put:                   /* is_target_pte_for_mc() gets the page */
+put:                   /* get_mctgt_type() gets the page */
                        put_page(page);
                        break;
                case MC_TARGET_SWAP:
index 56080ea361406090860493861d5253dc314debc1..c22076ffdd44283a07a9a8f0fde2567654f44302 100644 (file)
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       if (!PageHuge(p) && !PageTransCompound(p)) {
+       if (!PageHuge(p) && !PageTransTail(p)) {
                if (!PageLRU(p))
                        shake_page(p, 0);
                if (!PageLRU(p)) {
index 8438c157e4d99bed4b34ba4f48a05f02bcd55515..3416b6e018d6a7667fce3168924da3dccdec8e22 100644 (file)
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
 
 #if defined(SPLIT_RSS_COUNTING)
 
-static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+void sync_mm_rss(struct mm_struct *mm)
 {
        int i;
 
        for (i = 0; i < NR_MM_COUNTERS; i++) {
-               if (task->rss_stat.count[i]) {
-                       add_mm_counter(mm, i, task->rss_stat.count[i]);
-                       task->rss_stat.count[i] = 0;
+               if (current->rss_stat.count[i]) {
+                       add_mm_counter(mm, i, current->rss_stat.count[i]);
+                       current->rss_stat.count[i] = 0;
                }
        }
-       task->rss_stat.events = 0;
+       current->rss_stat.events = 0;
 }
 
 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
        if (unlikely(task != current))
                return;
        if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
-               __sync_task_rss_stat(task, task->mm);
-}
-
-unsigned long get_mm_counter(struct mm_struct *mm, int member)
-{
-       long val = 0;
-
-       /*
-        * Don't use task->mm here...for avoiding to use task_get_mm()..
-        * The caller must guarantee task->mm is not invalid.
-        */
-       val = atomic_long_read(&mm->rss_stat.count[member]);
-       /*
-        * counter is updated in asynchronous manner and may go to minus.
-        * But it's never be expected number for users.
-        */
-       if (val < 0)
-               return 0;
-       return (unsigned long)val;
-}
-
-void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
-{
-       __sync_task_rss_stat(task, mm);
+               sync_mm_rss(task->mm);
 }
 #else /* SPLIT_RSS_COUNTING */
 
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
        int i;
 
        if (current->mm == mm)
-               sync_mm_rss(current, mm);
+               sync_mm_rss(mm);
        for (i = 0; i < NR_MM_COUNTERS; i++)
                if (rss[i])
                        add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
-                       if (next-addr != HPAGE_PMD_SIZE) {
+                       if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
-                               continue;
+                               goto next;
                        /* fall through */
                }
-               if (pmd_none_or_clear_bad(pmd))
-                       continue;
+               /*
+                * Because MADV_DONTNEED only holds mmap_sem in read mode,
+                * other MADV_DONTNEED calls or transparent hugepage faults
+                * can run concurrently, so a pmd that is none or trans huge
+                * here can change under us.
+                */
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+                       goto next;
                next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);
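
The pmd_none_or_trans_huge_or_clear_bad() helper used above exists because the entry can change under a reader that only holds mmap_sem for read. The essence is to take one snapshot of the entry and classify that snapshot, rather than re-reading a value that may have become huge in the meantime. A minimal userspace sketch of that idea (invented entry bits; it does not reproduce the kernel's barrier details):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_NONE  0x0u
    #define ENTRY_HUGE  0x2u            /* pretend bit 1 marks a trans-huge entry */
    #define ENTRY_BAD   0x4u            /* pretend bit 2 marks a corrupted entry */

    /* Classify a page-table entry from a single snapshot of its value. */
    static int none_or_huge_or_bad(const _Atomic uint32_t *slot)
    {
        /* One load: everything below works on this snapshot only. */
        uint32_t val = atomic_load_explicit(slot, memory_order_relaxed);

        if (val == ENTRY_NONE)
            return 1;
        if (val & ENTRY_HUGE)           /* a concurrent fault may have installed it */
            return 1;
        if (val & ENTRY_BAD) {
            fprintf(stderr, "bad entry %#x, skipping\n", (unsigned)val);
            return 1;
        }
        return 0;                       /* present, normal: safe to walk the lower level */
    }

    int main(void)
    {
        _Atomic uint32_t slot = ENTRY_HUGE;
        printf("skip=%d\n", none_or_huge_or_bad(&slot));
        return 0;
    }
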
 
index 47296fee23dbaa67c3408227d7653037a371bcdf..cfb6c8678754fdb3baf411e2331e87ea64766fb2 100644 (file)
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        do {
                next = pmd_addr_end(addr, end);
                split_huge_page_pmd(vma->vm_mm, pmd);
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                err = -ESRCH;
                goto out;
        }
-       mm = get_task_mm(task);
-       rcu_read_unlock();
+       get_task_struct(task);
 
        err = -EINVAL;
-       if (!mm)
-               goto out;
 
        /*
         * Check if this process has the right to modify the specified
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
-       rcu_read_lock();
        tcred = __task_cred(task);
        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
            !capable(CAP_SYS_NICE)) {
                rcu_read_unlock();
                err = -EPERM;
-               goto out;
+               goto out_put;
        }
        rcu_read_unlock();
 
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
-               goto out;
+               goto out_put;
        }
 
        if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
                err = -EINVAL;
-               goto out;
+               goto out_put;
        }
 
        err = security_task_movememory(task);
        if (err)
-               goto out;
+               goto out_put;
 
-       err = do_migrate_pages(mm, old, new,
-               capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
-out:
+       mm = get_task_mm(task);
+       put_task_struct(task);
        if (mm)
-               mmput(mm);
+               err = do_migrate_pages(mm, old, new,
+                       capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+       else
+               err = -EINVAL;
+
+       mmput(mm);
+out:
        NODEMASK_SCRATCH_FREE(scratch);
 
        return err;
+
+out_put:
+       put_task_struct(task);
+       goto out;
+
 }
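
The reordering above — pin the task first, finish the permission checks, and only then take a reference on its mm — is easier to see outside the syscall. A small userspace sketch of the same pin/check/derive/unpin shape with the usual goto-based unwind (toy types, not the kernel helpers):

    #include <stdio.h>

    struct mm   { int refs; };
    struct task { int refs; int allowed; struct mm *mm; };

    static void get_task(struct task *t) { t->refs++; }
    static void put_task(struct task *t) { t->refs--; }

    /* Take a reference on the task's mm; valid only while the task is pinned. */
    static struct mm *get_task_mm(struct task *t)
    {
        if (!t->mm)
            return NULL;
        t->mm->refs++;
        return t->mm;
    }

    static void mmput(struct mm *mm) { mm->refs--; }

    static int migrate(struct task *t)
    {
        struct mm *mm;
        int err;

        get_task(t);                 /* pin the task before any checks */

        if (!t->allowed) {           /* permission/security checks come first */
            err = -1;
            goto out_put;
        }

        mm = get_task_mm(t);         /* only now take a reference on the mm */
        put_task(t);                 /* the task reference is no longer needed */
        if (!mm)
            return -1;

        /* ... the real work (do_migrate_pages) would run against mm here ... */
        err = 0;

        mmput(mm);
        return err;

    out_put:
        put_task(t);
        return err;
    }

    int main(void)
    {
        struct mm  mm = { .refs = 1 };
        struct task t = { .refs = 1, .allowed = 1, .mm = &mm };
        int err = migrate(&t);

        printf("migrate: %d (task refs %d, mm refs %d)\n", err, t.refs, mm.refs);
        return 0;
    }
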
 
 
@@ -1844,18 +1850,24 @@ struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, int node)
 {
-       struct mempolicy *pol = get_vma_policy(current, vma, addr);
+       struct mempolicy *pol;
        struct zonelist *zl;
        struct page *page;
+       unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+       pol = get_vma_policy(current, vma, addr);
+       cpuset_mems_cookie = get_mems_allowed();
 
-       get_mems_allowed();
        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                unsigned nid;
 
                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                mpol_cond_put(pol);
                page = alloc_page_interleave(gfp, order, nid);
-               put_mems_allowed();
+               if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                       goto retry_cpuset;
+
                return page;
        }
        zl = policy_zonelist(gfp, pol, node);
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                struct page *page =  __alloc_pages_nodemask(gfp, order,
                                                zl, policy_nodemask(gfp, pol));
                __mpol_put(pol);
-               put_mems_allowed();
+               if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                       goto retry_cpuset;
                return page;
        }
        /*
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
         */
        page = __alloc_pages_nodemask(gfp, order, zl,
                                      policy_nodemask(gfp, pol));
-       put_mems_allowed();
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
        return page;
 }
 
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
        struct mempolicy *pol = current->mempolicy;
        struct page *page;
+       unsigned int cpuset_mems_cookie;
 
        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
        /*
         * No reference counting needed for current->mempolicy
         * nor system default_policy
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                page = __alloc_pages_nodemask(gfp, order,
                                policy_zonelist(gfp, pol, numa_node_id()),
                                policy_nodemask(gfp, pol));
-       put_mems_allowed();
+
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
        return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
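
The get_mems_allowed()/put_mems_allowed() pairs above are being converted into a sequence-counter style retry: sample a generation cookie, attempt the allocation, and retry only if the allocation failed and the allowed mask changed underneath it. A minimal userspace sketch of that retry pattern (the names are made up; the kernel keys the cookie off a seqcount protecting current->mems_allowed):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned mems_generation;   /* bumped whenever the allowed set changes */

    static unsigned get_mems_cookie(void)
    {
        return atomic_load(&mems_generation);
    }

    /* Returns true if the allowed set stayed stable since the cookie was taken. */
    static bool put_mems_cookie(unsigned cookie)
    {
        return atomic_load(&mems_generation) == cookie;
    }

    static void *try_alloc(void)
    {
        static int calls;
        static int page;

        if (calls++ == 0) {
            atomic_fetch_add(&mems_generation, 1); /* simulate a concurrent cpuset update */
            return NULL;                           /* ...that made this attempt fail */
        }
        return &page;
    }

    int main(void)
    {
        void *page;
        unsigned cookie;
        int attempts = 0;

    retry:
        cookie = get_mems_cookie();
        page = try_alloc();
        attempts++;
        /* Only retry when the allocation failed *and* the mask changed meanwhile. */
        if (!put_mems_cookie(cookie) && !page && attempts < 5)
            goto retry;

        printf("page=%p after %d attempt(s)\n", page, attempts);
        return 0;
    }
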
index 1503b6b54ecbd1817e6e92968d40c404a6e90f75..51c08a0c6f68ac6e78d09568bd270ad95aac7622 100644 (file)
@@ -1174,20 +1174,17 @@ set_status:
  * Migrate an array of page addresses onto an array of nodes and fill
  * the corresponding array of status.
  */
-static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                         unsigned long nr_pages,
                         const void __user * __user *pages,
                         const int __user *nodes,
                         int __user *status, int flags)
 {
        struct page_to_node *pm;
-       nodemask_t task_nodes;
        unsigned long chunk_nr_pages;
        unsigned long chunk_start;
        int err;
 
-       task_nodes = cpuset_mems_allowed(task);
-
        err = -ENOMEM;
        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
        if (!pm)
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        struct task_struct *task;
        struct mm_struct *mm;
        int err;
+       nodemask_t task_nodes;
 
        /* Check flags */
        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
                rcu_read_unlock();
                return -ESRCH;
        }
-       mm = get_task_mm(task);
-       rcu_read_unlock();
-
-       if (!mm)
-               return -EINVAL;
+       get_task_struct(task);
 
        /*
         * Check if this process has the right to modify the specified
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
-       rcu_read_lock();
        tcred = __task_cred(task);
        if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
            cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        if (err)
                goto out;
 
-       if (nodes) {
-               err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
-                                   flags);
-       } else {
-               err = do_pages_stat(mm, nr_pages, pages, status);
-       }
+       task_nodes = cpuset_mems_allowed(task);
+       mm = get_task_mm(task);
+       put_task_struct(task);
+
+       if (mm) {
+               if (nodes)
+                       err = do_pages_move(mm, task_nodes, nr_pages, pages,
+                                           nodes, status, flags);
+               else
+                       err = do_pages_stat(mm, nr_pages, pages, status);
+       } else
+               err = -EINVAL;
 
-out:
        mmput(mm);
        return err;
+
+out:
+       put_task_struct(task);
+       return err;
 }
 
 /*
index 636a86876ff217a7f6d18f29c18524c2f2645a63..936b4cee8cb1ee126e7c78672d54062437ba5c45 100644 (file)
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                        }
                        /* fall through */
                }
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        mincore_unmapped_range(vma, addr, next, vec);
                else
                        mincore_pte_range(vma, pmd, addr, next, vec);
index 6f3766b57803c20d6115eb12f295f81e20cad4d3..a7bf6a31c9f62be11cb8e5819322565b7cf2c266 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /*
- * Helper for vma_adjust in the split_vma insert case:
- * insert vm structure into list and rbtree and anon_vma,
- * but it has already been inserted into prio_tree earlier.
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree.  It has already been inserted into the prio_tree.
  */
 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                 * A dummy user value is used because we are not locking
                 * memory so no accounting is necessary
                 */
-               len = ALIGN(len, huge_page_size(&default_hstate));
-               file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
-                                               &user, HUGETLB_ANONHUGE_INODE);
+               file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+                                               VM_NORESERVE, &user,
+                                               HUGETLB_ANONHUGE_INODE);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
        /*
         * Is this a new hole at the lowest possible address?
         */
-       if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+       if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
                mm->free_area_cache = addr;
-               mm->cached_hole_size = ~0UL;
-       }
 }
 
 /*
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
-       unsigned long addr = addr0;
+       unsigned long addr = addr0, start_addr;
 
        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                mm->free_area_cache = mm->mmap_base;
        }
 
+try_again:
        /* either no address requested or can't fit in requested address hole */
-       addr = mm->free_area_cache;
-
-       /* make sure it can fit in the remaining address space */
-       if (addr > len) {
-               vma = find_vma(mm, addr-len);
-               if (!vma || addr <= vma->vm_start)
-                       /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr-len);
-       }
-
-       if (mm->mmap_base < len)
-               goto bottomup;
+       start_addr = addr = mm->free_area_cache;
 
-       addr = mm->mmap_base-len;
+       if (addr < len)
+               goto fail;
 
+       addr -= len;
        do {
                /*
                 * Lookup failure means no vma is above this address,
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                addr = vma->vm_start-len;
        } while (len < vma->vm_start);
 
-bottomup:
+fail:
+       /*
+        * If the cached hint left us with no space for the requested
+        * mapping then try again:
+        *
+        * Note: unlike the bottom-up case, which does a full linear
+        * search, we use find_vma() here, so some holes may be skipped.
+        */
+       if (start_addr != mm->mmap_base) {
+               mm->free_area_cache = mm->mmap_base;
+               mm->cached_hole_size = 0;
+               goto try_again;
+       }
+
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
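
The try_again path added above is the interesting part: when the cached free_area_cache hint leaves no room for the requested length, the old code fell straight back to the bottom-up allocator, while the new code first resets the hint to the top of the mmap area and rescans. A toy model of that behaviour over a small slot array (purely illustrative, not the real VMA walk):

    #include <stdbool.h>
    #include <stdio.h>

    #define SLOTS 64                 /* pretend address space of 64 fixed-size slots */

    static bool used[SLOTS];
    static int free_cache = SLOTS;   /* cached top-of-search hint, like free_area_cache */

    /* Top-down search for `len` contiguous free slots below the cached hint. */
    static int get_unmapped_topdown(int len)
    {
        int top = free_cache;

    retry:
        for (int addr = top - len; addr >= 0; addr--) {
            bool fits = true;
            for (int i = 0; i < len; i++)
                if (used[addr + i]) { fits = false; break; }
            if (fits) {
                free_cache = addr;          /* remember the hole for next time */
                return addr;
            }
        }
        if (top != SLOTS) {                 /* stale hint left no room: retry from the top */
            top = SLOTS;
            goto retry;
        }
        return -1;                          /* genuinely full */
    }

    int main(void)
    {
        for (int i = 40; i < SLOTS; i++)    /* occupy the top of the space */
            used[i] = true;
        free_cache = 8;                     /* hint too low for a 16-slot request */

        printf("mapping placed at slot %d\n", get_unmapped_topdown(16));
        return 0;
    }
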
index cf332bc0080a39df65f3ce2cb425d714f41b7292..3dcfaf4ed355a3deb6dee71dab84bdaa98a7e9d4 100644 (file)
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
        struct task_struct *tsk = current;
 
        task_lock(tsk);
-       sync_mm_rss(tsk, mm);
+       sync_mm_rss(mm);
        tsk->mm = NULL;
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
index 142ef4a1f480d193822b0228aaba8f4715167169..a40992610ab6f6c0cbe96f9a9bd43a3fa973fd96 100644 (file)
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                ptent = pte_mkwrite(ptent);
 
                        ptep_modify_prot_commit(mm, addr, pte, ptent);
-               } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
+               } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
 
                        if (is_write_migration_entry(entry)) {
index 2958fd8e7c9abcfcf6cc7b38e2dac34133905b22..4198e000f41a5c3162a7851ace0a307a71026626 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/ptrace.h>
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
+#include <linux/ratelimit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  */
 static struct task_struct *select_bad_process(unsigned int *ppoints,
                unsigned long totalpages, struct mem_cgroup *memcg,
-               const nodemask_t *nodemask)
+               const nodemask_t *nodemask, bool force_kill)
 {
        struct task_struct *g, *p;
        struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
                        if (unlikely(frozen(p)))
                                __thaw_task(p);
-                       return ERR_PTR(-1UL);
+                       if (!force_kill)
+                               return ERR_PTR(-1UL);
                }
                if (!p->mm)
                        continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                        if (p == current) {
                                chosen = p;
                                *ppoints = 1000;
-                       } else {
+                       } else if (!force_kill) {
                                /*
                                 * If this task is not being ptraced on exit,
                                 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
-static int oom_kill_task(struct task_struct *p)
-{
-       struct task_struct *q;
-       struct mm_struct *mm;
-
-       p = find_lock_task_mm(p);
-       if (!p)
-               return 1;
-
-       /* mm cannot be safely dereferenced after task_unlock(p) */
-       mm = p->mm;
-
-       pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
-               task_pid_nr(p), p->comm, K(p->mm->total_vm),
-               K(get_mm_counter(p->mm, MM_ANONPAGES)),
-               K(get_mm_counter(p->mm, MM_FILEPAGES)));
-       task_unlock(p);
-
-       /*
-        * Kill all user processes sharing p->mm in other thread groups, if any.
-        * They don't get access to memory reserves or a higher scheduler
-        * priority, though, to avoid depletion of all memory or task
-        * starvation.  This prevents mm->mmap_sem livelock when an oom killed
-        * task cannot exit because it requires the semaphore and its contended
-        * by another thread trying to allocate memory itself.  That thread will
-        * now get access to memory reserves since it has a pending fatal
-        * signal.
-        */
-       for_each_process(q)
-               if (q->mm == mm && !same_thread_group(q, p) &&
-                   !(q->flags & PF_KTHREAD)) {
-                       if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-                               continue;
-
-                       task_lock(q);   /* Protect ->comm from prctl() */
-                       pr_err("Kill process %d (%s) sharing same memory\n",
-                               task_pid_nr(q), q->comm);
-                       task_unlock(q);
-                       force_sig(SIGKILL, q);
-               }
-
-       set_tsk_thread_flag(p, TIF_MEMDIE);
-       force_sig(SIGKILL, p);
-
-       return 0;
-}
-#undef K
-
-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-                           unsigned int points, unsigned long totalpages,
-                           struct mem_cgroup *memcg, nodemask_t *nodemask,
-                           const char *message)
+static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+                            unsigned int points, unsigned long totalpages,
+                            struct mem_cgroup *memcg, nodemask_t *nodemask,
+                            const char *message)
 {
        struct task_struct *victim = p;
        struct task_struct *child;
        struct task_struct *t = p;
+       struct mm_struct *mm;
        unsigned int victim_points = 0;
-
-       if (printk_ratelimit())
-               dump_header(p, gfp_mask, order, memcg, nodemask);
+       static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                             DEFAULT_RATELIMIT_BURST);
 
        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
         */
        if (p->flags & PF_EXITING) {
                set_tsk_thread_flag(p, TIF_MEMDIE);
-               return 0;
+               return;
        }
 
+       if (__ratelimit(&oom_rs))
+               dump_header(p, gfp_mask, order, memcg, nodemask);
+
        task_lock(p);
        pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
                message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                }
        } while_each_thread(p, t);
 
-       return oom_kill_task(victim);
+       victim = find_lock_task_mm(victim);
+       if (!victim)
+               return;
+
+       /* mm cannot safely be dereferenced after task_unlock(victim) */
+       mm = victim->mm;
+       pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+               task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+               K(get_mm_counter(victim->mm, MM_ANONPAGES)),
+               K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+       task_unlock(victim);
+
+       /*
+        * Kill all user processes sharing victim->mm in other thread groups, if
+        * any.  They don't get access to memory reserves, though, to avoid
+        * depletion of all memory.  This prevents mm->mmap_sem livelock when an
+        * oom killed thread cannot exit because it requires the semaphore and
+        * it's contended by another thread trying to allocate memory itself.
+        * That thread will now get access to memory reserves since it has a
+        * pending fatal signal.
+        */
+       for_each_process(p)
+               if (p->mm == mm && !same_thread_group(p, victim) &&
+                   !(p->flags & PF_KTHREAD)) {
+                       if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                               continue;
+
+                       task_lock(p);   /* Protect ->comm from prctl() */
+                       pr_err("Kill process %d (%s) sharing same memory\n",
+                               task_pid_nr(p), p->comm);
+                       task_unlock(p);
+                       force_sig(SIGKILL, p);
+               }
+
+       set_tsk_thread_flag(victim, TIF_MEMDIE);
+       force_sig(SIGKILL, victim);
 }
+#undef K
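
The switch from printk_ratelimit() to a local DEFINE_RATELIMIT_STATE() gives the OOM killer its own interval/burst budget instead of sharing the global printk one. A rough userspace sketch of what such a per-site limiter does (simplified; the kernel version also counts suppressed callbacks and works in jiffies):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Allow at most `burst` messages per `interval` seconds. */
    struct ratelimit {
        int interval;
        int burst;
        int printed;
        time_t window_start;
    };

    static bool ratelimit_ok(struct ratelimit *rs)
    {
        time_t now = time(NULL);

        if (now - rs->window_start >= rs->interval) {
            rs->window_start = now;     /* new window: reset the budget */
            rs->printed = 0;
        }
        if (rs->printed >= rs->burst)
            return false;               /* over budget: suppress this message */
        rs->printed++;
        return true;
    }

    int main(void)
    {
        struct ratelimit oom_rs = { .interval = 5, .burst = 2 };

        for (int i = 0; i < 5; i++)
            if (ratelimit_ok(&oom_rs))
                printf("dump_header() call %d allowed\n", i);
        return 0;
    }
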
 
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                             int order)
 {
        unsigned long limit;
        unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
                return;
        }
 
-       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
+       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
        limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
        read_lock(&tasklist_lock);
-retry:
-       p = select_bad_process(&points, limit, memcg, NULL);
-       if (!p || PTR_ERR(p) == -1UL)
-               goto out;
-
-       if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
-                               "Memory cgroup out of memory"))
-               goto retry;
-out:
+       p = select_bad_process(&points, limit, memcg, NULL, false);
+       if (p && PTR_ERR(p) != -1UL)
+               oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
+                                "Memory cgroup out of memory");
        read_unlock(&tasklist_lock);
 }
 #endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
  * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
  * don't have to be perfect here, we just have to be good.
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-               int order, nodemask_t *nodemask)
+               int order, nodemask_t *nodemask, bool force_kill)
 {
        const nodemask_t *mpol_mask;
        struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
        if (sysctl_oom_kill_allocating_task &&
            !oom_unkillable_task(current, NULL, nodemask) &&
            current->mm) {
-               /*
-                * oom_kill_process() needs tasklist_lock held.  If it returns
-                * non-zero, current could not be killed so we must fallback to
-                * the tasklist scan.
-                */
-               if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
-                               NULL, nodemask,
-                               "Out of memory (oom_kill_allocating_task)"))
-                       goto out;
-       }
-
-retry:
-       p = select_bad_process(&points, totalpages, NULL, mpol_mask);
-       if (PTR_ERR(p) == -1UL)
+               oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
+                                nodemask,
+                                "Out of memory (oom_kill_allocating_task)");
                goto out;
+       }
 
+       p = select_bad_process(&points, totalpages, NULL, mpol_mask,
+                              force_kill);
        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!p) {
                dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
                read_unlock(&tasklist_lock);
                panic("Out of memory and no killable processes...\n");
        }
-
-       if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
-                               nodemask, "Out of memory"))
-               goto retry;
-       killed = 1;
+       if (PTR_ERR(p) != -1UL) {
+               oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+                                nodemask, "Out of memory");
+               killed = 1;
+       }
 out:
        read_unlock(&tasklist_lock);
 
@@ -792,7 +774,7 @@ out:
 void pagefault_out_of_memory(void)
 {
        if (try_set_system_oom()) {
-               out_of_memory(NULL, 0, 0, NULL);
+               out_of_memory(NULL, 0, 0, NULL, false);
                clear_system_oom();
        }
        if (!test_thread_flag(TIF_MEMDIE))
index 363ba7082ef59efab5e90d94184cbf593aeff7ea..3fc261705b1e068c28c72a329f31ee431ae3fdc9 100644 (file)
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 
         for ( ; ; ) {
                global_dirty_limits(&background_thresh, &dirty_thresh);
+               dirty_thresh = hard_dirty_limit(dirty_thresh);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
index a13ded1938f0bacd90a438d983e3d301a649bfe9..caea788628e4dd62060e4838d385e9858f415f25 100644 (file)
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       out_of_memory(zonelist, gfp_mask, order, nodemask);
+       out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
        clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        if (!order)
                return NULL;
 
-       if (compaction_deferred(preferred_zone)) {
+       if (compaction_deferred(preferred_zone, order)) {
                *deferred_compaction = true;
                return NULL;
        }
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                if (page) {
                        preferred_zone->compact_considered = 0;
                        preferred_zone->compact_defer_shift = 0;
+                       if (order >= preferred_zone->compact_order_failed)
+                               preferred_zone->compact_order_failed = order + 1;
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 * defer if the failure was a sync compaction failure.
                 */
                if (sync_migration)
-                       defer_compaction(preferred_zone);
+                       defer_compaction(preferred_zone, order);
 
                cond_resched();
        }
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
 
        gfp_mask &= gfp_allowed_mask;
 
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
 
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
-       put_mems_allowed();
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
        bool ret = false;
+       unsigned int cpuset_mems_cookie;
 
        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;
 
-       get_mems_allowed();
-       ret = !node_isset(nid, cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
 out:
        return ret;
 }
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
        }
 }
 
-int __init add_from_early_node_map(struct range *range, int az,
-                                  int nr_range, int nid)
-{
-       unsigned long start_pfn, end_pfn;
-       int i;
-
-       /* need to go over early_node_map to find out good range for node */
-       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
-               nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
-       return nr_range;
-}
-
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
  * memory. When they don't, some nodes will have more kernelcore than
  * others
  */
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
 {
        int i, nid;
        unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-       find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+       find_zone_movable_pfns_for_nodes();
 
        /* Print out the zone ranges */
        printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
        int cpu = (unsigned long)hcpu;
 
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               lru_add_drain_cpu(cpu);
                drain_pages(cpu);
 
                /*
index 2f5cf10ff6607483433cc3a059d28023afa9a042..aa9701e12714af2ce7ead752def02200910bf2b1 100644 (file)
@@ -59,7 +59,7 @@ again:
                        continue;
 
                split_huge_page_pmd(walk->mm, pmd);
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
index eb663fb533e06f45305ac0b348b2365ecb1615bb..5a74fea182f152bd2fee6182e6b06f1dbaa4f922 100644 (file)
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
 {
        int young;
-#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#else
        BUG();
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
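
The hunk above only moves the VM_BUG_ON() inside the #ifdef, so the alignment check is compiled solely when transparent hugepages are configured (otherwise the function hits BUG() before reaching it). The same shape in a tiny standalone form (CONFIG_THP_SKETCH and HPAGE_MASK are stand-ins, not the kernel symbols):

    #include <assert.h>
    #include <stdio.h>

    #define HPAGE_MASK (~((1UL << 21) - 1))   /* assumed 2MB huge-page mask */

    static void clear_flush_young(unsigned long address)
    {
    #ifdef CONFIG_THP_SKETCH
        /* Only meaningful (and only compiled) when huge pages exist at all. */
        assert((address & ~HPAGE_MASK) == 0);
    #else
        (void)address;                        /* without THP this path should never run */
    #endif
        printf("flushed young bit at %#lx\n", address);
    }

    int main(void)
    {
        clear_flush_young(2UL << 21);
        return 0;
    }
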
index c8454e06b6c84b424de3952c567fbbcc8264ff75..5b5ad584ffb7dd7c9e00a885a018aaadba22f371 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
        kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
 }
 
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+                               struct anon_vma_chain *avc,
+                               struct anon_vma *anon_vma)
+{
+       avc->vma = vma;
+       avc->anon_vma = anon_vma;
+       list_add(&avc->same_vma, &vma->anon_vma_chain);
+
+       /*
+        * It's critical to add new vmas to the tail of the anon_vma,
+        * see comment in huge_memory.c:__split_huge_page().
+        */
+       list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+}
+
 /**
  * anon_vma_prepare - attach an anon_vma to a memory region
  * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
-                       avc->anon_vma = anon_vma;
-                       avc->vma = vma;
-                       list_add(&avc->same_vma, &vma->anon_vma_chain);
-                       list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+                       anon_vma_chain_link(vma, avc, anon_vma);
                        allocated = NULL;
                        avc = NULL;
                }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
                mutex_unlock(&root->mutex);
 }
 
-static void anon_vma_chain_link(struct vm_area_struct *vma,
-                               struct anon_vma_chain *avc,
-                               struct anon_vma *anon_vma)
-{
-       avc->vma = vma;
-       avc->anon_vma = anon_vma;
-       list_add(&avc->same_vma, &vma->anon_vma_chain);
-
-       /*
-        * It's critical to add new vmas to the tail of the anon_vma,
-        * see comment in huge_memory.c:__split_huge_page().
-        */
-       list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-}
-
 /*
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
+       bool locked;
+       unsigned long flags;
+
+       mem_cgroup_begin_update_page_stat(page, &locked, &flags);
        if (atomic_inc_and_test(&page->_mapcount)) {
                __inc_zone_page_state(page, NR_FILE_MAPPED);
                mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
        }
+       mem_cgroup_end_update_page_stat(page, &locked, &flags);
 }
 
 /**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
+       bool anon = PageAnon(page);
+       bool locked;
+       unsigned long flags;
+
+       /*
+        * The anon case has no mem_cgroup page_stat to update, but it may
+        * call mem_cgroup_uncharge_page() below, where the lock ordering
+        * could deadlock against a page_stat move: so avoid taking it on anon.
+        */
+       if (!anon)
+               mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+
        /* page still mapped by someone else? */
        if (!atomic_add_negative(-1, &page->_mapcount))
-               return;
+               goto out;
 
        /*
         * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
         * not if it's in swapcache - there might be another pte slot
         * containing the swap entry, but page not yet written to swap.
         */
-       if ((!PageAnon(page) || PageSwapCache(page)) &&
+       if ((!anon || PageSwapCache(page)) &&
            page_test_and_clear_dirty(page_to_pfn(page), 1))
                set_page_dirty(page);
        /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
         * and not charged by memcg for now.
         */
        if (unlikely(PageHuge(page)))
-               return;
-       if (PageAnon(page)) {
+               goto out;
+       if (anon) {
                mem_cgroup_uncharge_page(page);
                if (!PageTransHuge(page))
                        __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
         * Leaving it set also helps swapoff to reinstate ptes
         * faster for those pages still in swapcache.
         */
+out:
+       if (!anon)
+               mem_cgroup_end_update_page_stat(page, &locked, &flags);
 }
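
page_add_file_rmap() and page_remove_rmap() now bracket the counter update with mem_cgroup_begin_update_page_stat()/mem_cgroup_end_update_page_stat(). The pattern worth noting is the conditional lock bracket: begin() decides whether a lock is needed and reports it through *locked, so end() releases only what was actually taken. A bare-bones userspace sketch of that shape (build with cc -pthread; the real kernel pair also juggles irq flags and per-memcg state):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool move_in_progress;        /* only then does the update need the lock */
    static long file_mapped;

    static void begin_update_stat(bool *locked)
    {
        *locked = false;
        if (!move_in_progress)
            return;                      /* fast path: no mover, no lock */
        pthread_mutex_lock(&stat_lock);
        *locked = true;
    }

    static void end_update_stat(bool *locked)
    {
        if (*locked)
            pthread_mutex_unlock(&stat_lock);
    }

    int main(void)
    {
        bool locked;

        begin_update_stat(&locked);
        file_mapped++;                   /* the stat update itself */
        end_update_stat(&locked);

        printf("file_mapped=%ld (locked path taken: %d)\n", file_mapped, locked);
        return 0;
    }
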
 
 /*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
-               } else if (PAGE_MIGRATION) {
+               } else if (IS_ENABLED(CONFIG_MIGRATION)) {
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
-       } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
+       } else if (IS_ENABLED(CONFIG_MIGRATION) &&
+                  (TTU_ACTION(flags) == TTU_MIGRATION)) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
                 * locking requirements of exec(), migration skips
                 * temporary VMAs until after exec() completes.
                 */
-               if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+               if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
                                is_vma_temporary_stack(vma))
                        continue;
 
index 7a45ad004cfd0f908da54b47e298702bc7080a2a..f99ff3e50bd6af061b30722528d002bab68e6e40 100644 (file)
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
 
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#else
+#define shmem_initxattrs NULL
+#endif
+
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
        if (inode) {
                error = security_inode_init_security(inode, dir,
                                                     &dentry->d_name,
-                                                    NULL, NULL);
+                                                    shmem_initxattrs, NULL);
                if (error) {
                        if (error != -EOPNOTSUPP) {
                                iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                return -ENOSPC;
 
        error = security_inode_init_security(inode, dir, &dentry->d_name,
-                                            NULL, NULL);
+                                            shmem_initxattrs, NULL);
        if (error) {
                if (error != -EOPNOTSUPP) {
                        iput(inode);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
  * filesystem level, though.
  */
 
+/*
+ * Allocate new xattr and copy in the value; but leave the name to callers.
+ */
+static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
+{
+       struct shmem_xattr *new_xattr;
+       size_t len;
+
+       /* wrap around? */
+       len = sizeof(*new_xattr) + size;
+       if (len <= sizeof(*new_xattr))
+               return NULL;
+
+       new_xattr = kmalloc(len, GFP_KERNEL);
+       if (!new_xattr)
+               return NULL;
+
+       new_xattr->size = size;
+       memcpy(new_xattr->value, value, size);
+       return new_xattr;
+}
+
+/*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+static int shmem_initxattrs(struct inode *inode,
+                           const struct xattr *xattr_array,
+                           void *fs_info)
+{
+       struct shmem_inode_info *info = SHMEM_I(inode);
+       const struct xattr *xattr;
+       struct shmem_xattr *new_xattr;
+       size_t len;
+
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+               if (!new_xattr)
+                       return -ENOMEM;
+
+               len = strlen(xattr->name) + 1;
+               new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+                                         GFP_KERNEL);
+               if (!new_xattr->name) {
+                       kfree(new_xattr);
+                       return -ENOMEM;
+               }
+
+               memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+                      XATTR_SECURITY_PREFIX_LEN);
+               memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+                      xattr->name, len);
+
+               spin_lock(&info->lock);
+               list_add(&new_xattr->list, &info->xattr_list);
+               spin_unlock(&info->lock);
+       }
+
+       return 0;
+}
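
shmem_initxattrs() receives the attribute name without its "security." prefix and has to rebuild the full name itself. The allocate-and-copy it performs is essentially the following (standalone sketch; the kernel spells the prefix XATTR_SECURITY_PREFIX and uses kmalloc):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define SECURITY_PREFIX     "security."
    #define SECURITY_PREFIX_LEN (sizeof(SECURITY_PREFIX) - 1)

    /* Build "security.<name>": allocate prefix + name (including its NUL)
     * and copy the two pieces in, just like the callback above. */
    static char *make_prefixed_name(const char *name)
    {
        size_t len = strlen(name) + 1;               /* name plus trailing NUL */
        char *full = malloc(SECURITY_PREFIX_LEN + len);

        if (!full)
            return NULL;
        memcpy(full, SECURITY_PREFIX, SECURITY_PREFIX_LEN);
        memcpy(full + SECURITY_PREFIX_LEN, name, len);
        return full;
    }

    int main(void)
    {
        char *full = make_prefixed_name("selinux");

        if (!full)
            return 1;
        printf("%s\n", full);                        /* -> security.selinux */
        free(full);
        return 0;
    }
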
+
 static int shmem_xattr_get(struct dentry *dentry, const char *name,
                           void *buffer, size_t size)
 {
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
        return ret;
 }
 
-static int shmem_xattr_set(struct dentry *dentry, const char *name,
+static int shmem_xattr_set(struct inode *inode, const char *name,
                           const void *value, size_t size, int flags)
 {
-       struct inode *inode = dentry->d_inode;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_xattr *xattr;
        struct shmem_xattr *new_xattr = NULL;
-       size_t len;
        int err = 0;
 
        /* value == NULL means remove */
        if (value) {
-               /* wrap around? */
-               len = sizeof(*new_xattr) + size;
-               if (len <= sizeof(*new_xattr))
-                       return -ENOMEM;
-
-               new_xattr = kmalloc(len, GFP_KERNEL);
+               new_xattr = shmem_xattr_alloc(value, size);
                if (!new_xattr)
                        return -ENOMEM;
 
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
                        kfree(new_xattr);
                        return -ENOMEM;
                }
-
-               new_xattr->size = size;
-               memcpy(new_xattr->value, value, size);
        }
 
        spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
        if (size == 0)
                value = "";  /* empty EA, do not remove */
 
-       return shmem_xattr_set(dentry, name, value, size, flags);
+       return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
 
 }
 
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
        if (err)
                return err;
 
-       return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
+       return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
 }
 
 static bool xattr_is_trusted(const char *name)
index f0bd7857ab3bed2adf6649e60dda6ad712ef0b92..29c8716eb7a9a69a4d67501081add66d0d9caa36 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
        if (in_interrupt() || (flags & __GFP_THISNODE))
                return NULL;
        nid_alloc = nid_here = numa_mem_id();
-       get_mems_allowed();
        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
                nid_alloc = cpuset_slab_spread_node();
        else if (current->mempolicy)
                nid_alloc = slab_node(current->mempolicy);
-       put_mems_allowed();
        if (nid_alloc != nid_here)
                return ____cache_alloc_node(cachep, flags, nid_alloc);
        return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
        enum zone_type high_zoneidx = gfp_zone(flags);
        void *obj = NULL;
        int nid;
+       unsigned int cpuset_mems_cookie;
 
        if (flags & __GFP_THISNODE)
                return NULL;
 
-       get_mems_allowed();
-       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
        local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
 retry:
        /*
         * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
                        }
                }
        }
-       put_mems_allowed();
+
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+               goto retry_cpuset;
        return obj;
 }
 
index 4907563ef7ff7e199577f5d2ffedda6fa358271e..f4a6229848fdb6bd654c90947f341b6060ab0bde 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
        struct zone *zone;
        enum zone_type high_zoneidx = gfp_zone(flags);
        void *object;
+       unsigned int cpuset_mems_cookie;
 
        /*
         * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
                        get_cycles() % 1024 > s->remote_node_defrag_ratio)
                return NULL;
 
-       get_mems_allowed();
-       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               struct kmem_cache_node *n;
-
-               n = get_node(s, zone_to_nid(zone));
-
-               if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
-                               n->nr_partial > s->min_partial) {
-                       object = get_partial_node(s, n, c);
-                       if (object) {
-                               put_mems_allowed();
-                               return object;
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+                       struct kmem_cache_node *n;
+
+                       n = get_node(s, zone_to_nid(zone));
+
+                       if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+                                       n->nr_partial > s->min_partial) {
+                               object = get_partial_node(s, n, c);
+                               if (object) {
+                                       /*
+                                        * Return the object even if
+                                        * put_mems_allowed indicated that
+                                        * the cpuset mems_allowed was
+                                        * updated in parallel. It's a
+                                        * harmless race between the alloc
+                                        * and the cpuset update.
+                                        */
+                                       put_mems_allowed(cpuset_mems_cookie);
+                                       return object;
+                               }
                        }
                }
-       }
-       put_mems_allowed();
+       } while (!put_mems_allowed(cpuset_mems_cookie));
 #endif
        return NULL;
 }
index 61d7cde23111e91714dde8fe42e461a3f39c3f0d..a8bc7d364deb0a764cbd28956f1853fbb3ce421c 100644 (file)
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
 
        usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
                                                                 usemap_count);
-       if (usemap) {
-               for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-                       if (!present_section_nr(pnum))
-                               continue;
-                       usemap_map[pnum] = usemap;
-                       usemap += size;
+       if (!usemap) {
+               usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
+               if (!usemap) {
+                       printk(KERN_WARNING "%s: allocation failed\n", __func__);
+                       return;
                }
-               return;
        }
 
-       usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
-       if (usemap) {
-               for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-                       if (!present_section_nr(pnum))
-                               continue;
-                       usemap_map[pnum] = usemap;
-                       usemap += size;
-                       check_usemap_section_nr(nodeid, usemap_map[pnum]);
-               }
-               return;
+       for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+               if (!present_section_nr(pnum))
+                       continue;
+               usemap_map[pnum] = usemap;
+               usemap += size;
+               check_usemap_section_nr(nodeid, usemap_map[pnum]);
        }
-
-       printk(KERN_WARNING "%s: allocation failed\n", __func__);
 }
 
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
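
The sparse.c change above removes two copy-pasted fill loops by restructuring into: try the preferred allocation, fall back to the generic one, and then run a single shared carve-up loop. The same shape in miniature (illustrative only; the real code allocates from bootmem per node):

    #include <stdio.h>
    #include <stdlib.h>

    #define SECTIONS 8
    #define SECTION_BYTES 64

    static void *preferred_alloc(size_t size)
    {
        (void)size;                  /* stand-in for the node-local bootmem allocator */
        return NULL;                 /* pretend it failed so the fallback runs */
    }

    int main(void)
    {
        char *usemap_map[SECTIONS] = { 0 };
        char *usemap = preferred_alloc(SECTIONS * SECTION_BYTES);

        if (!usemap) {
            usemap = malloc(SECTIONS * SECTION_BYTES);   /* fallback allocator */
            if (!usemap) {
                fprintf(stderr, "allocation failed\n");
                return 1;
            }
        }

        for (int i = 0; i < SECTIONS; i++) {             /* one shared carve-up loop */
            usemap_map[i] = usemap;
            usemap += SECTION_BYTES;
        }

        printf("section 0 at %p, section 7 at %p\n",
               (void *)usemap_map[0], (void *)usemap_map[7]);
        free(usemap_map[0]);
        return 0;
    }
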
index 14380e9fbe3380b927cb58d75c5a950a5ec14f77..5c13f13389721fe60756ffb4dabe66d0c1e86e47 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
  * Either "cpu" is the current CPU, and preemption has already been
  * disabled; or "cpu" is being hot-unplugged, and is already dead.
  */
-static void drain_cpu_pagevecs(int cpu)
+void lru_add_drain_cpu(int cpu)
 {
        struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
        struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
 
 void lru_add_drain(void)
 {
-       drain_cpu_pagevecs(get_cpu());
+       lru_add_drain_cpu(get_cpu());
        put_cpu();
 }
 
index ea6b32d61873185f59db1eb9ae303488ec6de471..9d3dd3763cf763460b0f4f17b7ff630e5c8115c0 100644 (file)
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
 {
-       int nr_pages;
        struct page *page;
-       unsigned long offset;
-       unsigned long end_offset;
+       unsigned long offset = swp_offset(entry);
+       unsigned long start_offset, end_offset;
+       unsigned long mask = (1UL << page_cluster) - 1;
 
-       /*
-        * Get starting offset for readaround, and number of pages to read.
-        * Adjust starting address by readbehind (for NUMA interleave case)?
-        * No, it's very unlikely that swap layout would follow vma layout,
-        * more likely that neighbouring swap pages came from the same node:
-        * so use the same "addr" to choose the same node for each swap read.
-        */
-       nr_pages = valid_swaphandles(entry, &offset);
-       for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
+       /* Read a page_cluster sized and aligned cluster around offset. */
+       start_offset = offset & ~mask;
+       end_offset = offset | mask;
+       if (!start_offset)      /* First page is swap header. */
+               start_offset++;
+
+       for (offset = start_offset; offset <= end_offset ; offset++) {
                /* Ok, do the async read-ahead now */
                page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
                                                gfp_mask, vma, addr);
                if (!page)
-                       break;
+                       continue;
                page_cache_release(page);
        }
        lru_add_drain();        /* Push any new pages onto the LRU now */
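With this change swapin_readahead() no longer asks the swap map how far it may read (that was the job of valid_swaphandles(), deleted from mm/swapfile.c below); it simply reads the naturally aligned window of 2^page_cluster slots containing the faulting offset, skipping slot 0 because that slot holds the swap header. The window is pure mask arithmetic; a small stand-alone sketch (page_cluster value and sample offsets chosen arbitrarily):

/* Sketch of the aligned read-around window computed in the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned int page_cluster = 3;		   /* window = 2^3 = 8 slots */
	unsigned long mask = (1UL << page_cluster) - 1;
	unsigned long offsets[] = { 1, 5, 8, 42 }; /* sample faulting offsets */

	for (unsigned i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		unsigned long offset = offsets[i];
		unsigned long start = offset & ~mask;	/* round down to window */
		unsigned long end = offset | mask;	/* last slot, inclusive */

		if (!start)		/* slot 0 is the swap header */
			start++;
		printf("offset %2lu -> read slots [%lu..%lu]\n",
		       offset, start, end);
	}
	return 0;
}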
index 6bf67ab6e469ab2329a3fc1a679e79228c6cee9f..dae42f380d6ebcde88d3aaef6aa3cb1d64e0f5a3 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (unlikely(pmd_trans_huge(*pmd)))
-                       continue;
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
                if (ret)
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                        p->flags |= SWP_SOLIDSTATE;
                        p->cluster_next = 1 + (random32() % p->highest_bit);
                }
-               if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
+               if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
                        p->flags |= SWP_DISCARDABLE;
        }
 
@@ -2291,58 +2289,6 @@ int swapcache_prepare(swp_entry_t entry)
        return __swap_duplicate(entry, SWAP_HAS_CACHE);
 }
 
-/*
- * swap_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
-       struct swap_info_struct *si;
-       int our_page_cluster = page_cluster;
-       pgoff_t target, toff;
-       pgoff_t base, end;
-       int nr_pages = 0;
-
-       if (!our_page_cluster)  /* no readahead */
-               return 0;
-
-       si = swap_info[swp_type(entry)];
-       target = swp_offset(entry);
-       base = (target >> our_page_cluster) << our_page_cluster;
-       end = base + (1 << our_page_cluster);
-       if (!base)              /* first page is swap header */
-               base++;
-
-       spin_lock(&swap_lock);
-       if (end > si->max)      /* don't go beyond end of map */
-               end = si->max;
-
-       /* Count contiguous allocated slots above our target */
-       for (toff = target; ++toff < end; nr_pages++) {
-               /* Don't read in free or bad pages */
-               if (!si->swap_map[toff])
-                       break;
-               if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-                       break;
-       }
-       /* Count contiguous allocated slots below our target */
-       for (toff = target; --toff >= base; nr_pages++) {
-               /* Don't read in free or bad pages */
-               if (!si->swap_map[toff])
-                       break;
-               if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
-                       break;
-       }
-       spin_unlock(&swap_lock);
-
-       /*
-        * Indicate starting offset, and return number of pages to get:
-        * if only 1, say 0, since there's then no readahead to be done.
-        */
-       *offset = ++toff;
-       return nr_pages? ++nr_pages: 0;
-}
-
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
index 136ac4f322b8248a5bea3c11d93c69c001b7bda8..ae962b31de888a55990769aae948bac3ef0db338 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                next->vm_prev = vma;
 }
 
+/* Check if the vma is being used as a stack by this task */
+static int vm_is_stack_for_task(struct task_struct *t,
+                               struct vm_area_struct *vma)
+{
+       return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
+}
+
+/*
+ * Check if the vma is being used as a stack.
+ * If in_group is non-zero, check the entire thread group; otherwise
+ * check only the given task. Returns the pid of the task the vma is
+ * a stack for, or 0 if it is nobody's stack.
+ */
+pid_t vm_is_stack(struct task_struct *task,
+                 struct vm_area_struct *vma, int in_group)
+{
+       pid_t ret = 0;
+
+       if (vm_is_stack_for_task(task, vma))
+               return task->pid;
+
+       if (in_group) {
+               struct task_struct *t;
+               rcu_read_lock();
+               if (!pid_alive(task))
+                       goto done;
+
+               t = task;
+               do {
+                       if (vm_is_stack_for_task(t, vma)) {
+                               ret = t->pid;
+                               goto done;
+                       }
+               } while_each_thread(task, t);
+done:
+               rcu_read_unlock();
+       }
+
+       return ret;
+}
+
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
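vm_is_stack() above boils down to a containment test, "does this task's saved stack pointer fall inside the vma?", applied first to the passed-in task and then, if in_group is set, to every live thread in the group under rcu_read_lock(). The kernel types cannot be built stand-alone, so the following is only a simplified user-space model of that lookup; the struct names and field values are invented for illustration:

/* Simplified model of the vm_is_stack() lookup; types are invented. */
#include <stdio.h>

struct toy_task {
	int pid;
	unsigned long sp;	/* stand-in for KSTK_ESP(t) */
};

struct toy_vma {
	unsigned long vm_start, vm_end;
};

static int stack_for_task(const struct toy_task *t, const struct toy_vma *vma)
{
	return vma->vm_start <= t->sp && vma->vm_end >= t->sp;
}

/* Return the pid of the first thread whose stack pointer lies in @vma, 0 if none. */
static int toy_vm_is_stack(const struct toy_task *group, int nr,
			   const struct toy_vma *vma)
{
	for (int i = 0; i < nr; i++)
		if (stack_for_task(&group[i], vma))
			return group[i].pid;
	return 0;
}

int main(void)
{
	struct toy_task group[] = {
		{ .pid = 100, .sp = 0x10100UL },
		{ .pid = 101, .sp = 0x3f000UL },	/* lands inside the vma */
	};
	struct toy_vma vma = { .vm_start = 0x20000UL, .vm_end = 0x40000UL };

	printf("[stack:%d]\n", toy_vm_is_stack(group, 2, &vma));
	return 0;
}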
index c52b23552659af5bce0038dd6fa7ac10d044be40..49f15ef0a99a3f7f78adcd687f3ed54c421b2a0b 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  * @mz:                The mem_cgroup_zone to pull pages from.
  * @dst:       The temp list to put pages on to.
  * @nr_scanned:        The number of pages that were scanned.
- * @order:     The caller's attempted allocation order
+ * @sc:                The scan_control struct for this reclaim session
  * @mode:      One of the LRU isolation modes
  * @active:    True [1] if isolating active pages
  * @file:      True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct mem_cgroup_zone *mz, struct list_head *dst,
-               unsigned long *nr_scanned, int order, isolate_mode_t mode,
-               int active, int file)
+               unsigned long *nr_scanned, struct scan_control *sc,
+               isolate_mode_t mode, int active, int file)
 {
        struct lruvec *lruvec;
        struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        BUG();
                }
 
-               if (!order)
+               if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
                        continue;
 
                /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 */
                zone_id = page_zone_id(page);
                page_pfn = page_to_pfn(page);
-               pfn = page_pfn & ~((1 << order) - 1);
-               end_pfn = pfn + (1 << order);
+               pfn = page_pfn & ~((1 << sc->order) - 1);
+               end_pfn = pfn + (1 << sc->order);
                for (; pfn < end_pfn; pfn++) {
                        struct page *cursor_page;
 
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
        *nr_scanned = scan;
 
-       trace_mm_vmscan_lru_isolate(order,
+       trace_mm_vmscan_lru_isolate(sc->order,
                        nr_to_scan, scan,
                        nr_taken,
                        nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
                       unsigned long *nr_anon,
                       unsigned long *nr_file)
 {
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        struct zone *zone = mz->zone;
        unsigned int count[NR_LRU_LISTS] = { 0, };
        unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
                count[lru] += numpages;
        }
 
+       preempt_disable();
        __count_vm_events(PGDEACTIVATE, nr_active);
 
        __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
        *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
        *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
 
-       reclaim_stat->recent_scanned[0] += *nr_anon;
-       reclaim_stat->recent_scanned[1] += *nr_file;
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
+       preempt_enable();
 }
 
 /*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
        unsigned long nr_file;
        unsigned long nr_dirty = 0;
        unsigned long nr_writeback = 0;
-       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+       isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
        struct zone *zone = mz->zone;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 
        set_reclaim_mode(priority, sc, false);
        if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-               reclaim_mode |= ISOLATE_ACTIVE;
+               isolate_mode |= ISOLATE_ACTIVE;
 
        lru_add_drain();
 
        if (!sc->may_unmap)
-               reclaim_mode |= ISOLATE_UNMAPPED;
+               isolate_mode |= ISOLATE_UNMAPPED;
        if (!sc->may_writepage)
-               reclaim_mode |= ISOLATE_CLEAN;
+               isolate_mode |= ISOLATE_CLEAN;
 
        spin_lock_irq(&zone->lru_lock);
 
-       nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
-                                    &nr_scanned, sc->order,
-                                    reclaim_mode, 0, file);
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
+                                    sc, isolate_mode, 0, file);
        if (global_reclaim(sc)) {
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        }
+       spin_unlock_irq(&zone->lru_lock);
 
-       if (nr_taken == 0) {
-               spin_unlock_irq(&zone->lru_lock);
+       if (nr_taken == 0)
                return 0;
-       }
 
        update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
 
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
-
-       spin_unlock_irq(&zone->lru_lock);
-
        nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
                                                &nr_dirty, &nr_writeback);
 
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 
        spin_lock_irq(&zone->lru_lock);
 
+       reclaim_stat->recent_scanned[0] += nr_anon;
+       reclaim_stat->recent_scanned[1] += nr_file;
+
        if (current_is_kswapd())
                __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
        __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
        unsigned long pgmoved = 0;
        struct page *page;
 
-       if (buffer_heads_over_limit) {
-               spin_unlock_irq(&zone->lru_lock);
-               list_for_each_entry(page, list, lru) {
-                       if (page_has_private(page) && trylock_page(page)) {
-                               if (page_has_private(page))
-                                       try_to_release_page(page, 0);
-                               unlock_page(page);
-                       }
-               }
-               spin_lock_irq(&zone->lru_lock);
-       }
-
        while (!list_empty(list)) {
                struct lruvec *lruvec;
 
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        unsigned long nr_rotated = 0;
-       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+       isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
        struct zone *zone = mz->zone;
 
        lru_add_drain();
 
+       reset_reclaim_mode(sc);
+
        if (!sc->may_unmap)
-               reclaim_mode |= ISOLATE_UNMAPPED;
+               isolate_mode |= ISOLATE_UNMAPPED;
        if (!sc->may_writepage)
-               reclaim_mode |= ISOLATE_CLEAN;
+               isolate_mode |= ISOLATE_CLEAN;
 
        spin_lock_irq(&zone->lru_lock);
 
-       nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
-                                    &nr_scanned, sc->order,
-                                    reclaim_mode, 1, file);
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
+                                    isolate_mode, 1, file);
        if (global_reclaim(sc))
                zone->pages_scanned += nr_scanned;
 
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
                        continue;
                }
 
+               if (unlikely(buffer_heads_over_limit)) {
+                       if (page_has_private(page) && trylock_page(page)) {
+                               if (page_has_private(page))
+                                       try_to_release_page(page, 0);
+                               unlock_page(page);
+                       }
+               }
+
                if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
                        nr_rotated += hpage_nr_pages(page);
                        /*
@@ -2112,7 +2107,12 @@ restart:
                 * with multiple processes reclaiming pages, the total
                 * freeing target can get unreasonably large.
                 */
-               if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+               if (nr_reclaimed >= nr_to_reclaim)
+                       nr_to_reclaim = 0;
+               else
+                       nr_to_reclaim -= nr_reclaimed;
+
+               if (!nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
        blk_finish_plug(&plug);
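The hunk above stops treating nr_to_reclaim as a fixed per-pass threshold and turns it into a running budget: each pass subtracts what it actually reclaimed, clamping at zero so the unsigned value cannot wrap, and the loop bails out only once the whole budget is spent (and priority is below DEF_PRIORITY). A toy loop showing the saturating bookkeeping, with per-pass reclaim counts invented for illustration:

/* Toy model of the running reclaim budget from the hunk above. */
#include <stdio.h>

int main(void)
{
	unsigned long nr_to_reclaim = 32;		/* initial budget */
	unsigned long passes[] = { 10, 0, 25, 40 };	/* invented per-pass results */

	for (unsigned i = 0; i < sizeof(passes) / sizeof(passes[0]); i++) {
		unsigned long nr_reclaimed = passes[i];

		if (nr_reclaimed >= nr_to_reclaim)
			nr_to_reclaim = 0;	/* budget spent: clamp, don't wrap */
		else
			nr_to_reclaim -= nr_reclaimed;

		printf("pass %u: reclaimed %lu, %lu still to go\n",
		       i, nr_reclaimed, nr_to_reclaim);
		if (!nr_to_reclaim)
			break;
	}
	return 0;
}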
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
         * If compaction is deferred, reclaim up to a point where
         * compaction will have a chance of success when re-enabled
         */
-       if (compaction_deferred(zone))
+       if (compaction_deferred(zone, sc->order))
                return watermark_ok;
 
        /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
        unsigned long nr_soft_scanned;
        bool aborted_reclaim = false;
 
+       /*
+        * If the number o