Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux...
author  Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 17:13:10 +0000 (10:13 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 17:13:10 +0000 (10:13 -0700)
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
  Add _addr_lsb field to ia64 siginfo
  Fix migration.c compilation on s390
  HWPOISON: Remove retry loop for try_to_unmap
  HWPOISON: Turn addr_valid from bitfield into char
  HWPOISON: Disable DEBUG by default
  HWPOISON: Convert pr_debugs to pr_info
  HWPOISON: Improve comments in memory-failure.c
  x86: HWPOISON: Report correct address granularity for huge hwpoison faults
  Encode huge page size for VM_FAULT_HWPOISON errors
  Fix build error with !CONFIG_MIGRATION
  hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
  Clean up __page_set_anon_rmap
  HWPOISON, hugetlb: fix unpoison for hugepage
  HWPOISON, hugetlb: soft offlining for hugepage
  HWPOISON, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
  hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
  HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
  hugetlb: hugepage migration core
  hugetlb: redefine hugepage copy functions
  hugetlb: add allocate function for hugepage migration
  ...

13 files changed:
arch/ia64/include/asm/siginfo.h
arch/x86/mm/fault.c
fs/hugetlbfs/inode.c
fs/signalfd.c
include/linux/hugetlb.h
include/linux/migrate.h
include/linux/mm.h
include/linux/signalfd.h
mm/hugetlb.c
mm/memory-failure.c
mm/memory.c
mm/migrate.c
mm/rmap.c

diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h
index 118d4297900319df2d97840fc7149cce0ad8610c..c8fcaa2ac48f88f13bd93e1d25fd3fd5d2031c6a 100644
--- a/arch/ia64/include/asm/siginfo.h
+++ b/arch/ia64/include/asm/siginfo.h
@@ -62,6 +62,7 @@ typedef struct siginfo {
                        int _imm;               /* immediate value for "break" */
                        unsigned int _flags;    /* see below */
                        unsigned long _isr;     /* isr */
+                       short _addr_lsb;        /* lsb of faulting address */
                } _sigfault;
 
                /* SIGPOLL */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 79b0b372d2d033ca35a4bb83295a332c17bbb6c4..852b319edbdcfc524bb1c5d719b53036c13adf2c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
 #include <linux/kprobes.h>             /* __kprobes, ...               */
 #include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
 #include <linux/perf_event.h>          /* perf_sw_event                */
+#include <linux/hugetlb.h>             /* hstate_index_to_shift        */
 
 #include <asm/traps.h>                 /* dotraplinkage, ...           */
 #include <asm/pgalloc.h>               /* pgd_*(), ...                 */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-                    struct task_struct *tsk)
+                    struct task_struct *tsk, int fault)
 {
+       unsigned lsb = 0;
        siginfo_t info;
 
        info.si_signo   = si_signo;
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
-       info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+       if (fault & VM_FAULT_HWPOISON_LARGE)
+               lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 
+       if (fault & VM_FAULT_HWPOISON)
+               lsb = PAGE_SHIFT;
+       info.si_addr_lsb = lsb;
 
        force_sig_info(si_signo, &info, tsk);
 }
@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                tsk->thread.error_code  = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no     = 14;
 
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
                return;
        }
@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
        tsk->thread.trap_no     = 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-       if (fault & VM_FAULT_HWPOISON) {
+       if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                printk(KERN_ERR
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
                code = BUS_MCEERR_AR;
        }
 #endif
-       force_sig_info_fault(SIGBUS, code, address, tsk);
+       force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
 static noinline void
@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
-               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+                            VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
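
For reference, what the fault.c change above means for userspace: on a huge-page machine check the SIGBUS now carries the huge page shift in si_addr_lsb instead of PAGE_SHIFT, so a handler can compute the full poisoned range. A minimal, hypothetical handler sketch (not taken from this merge; assumes kernel and glibc headers new enough to expose si_addr_lsb and BUS_MCEERR_*):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical demo: derive the poisoned range from si_addr and si_addr_lsb. */
static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO) {
		unsigned long len = 1UL << si->si_addr_lsb;
		unsigned long start = (unsigned long)si->si_addr & ~(len - 1);

		/* printf is not async-signal-safe; acceptable for a demo only */
		fprintf(stderr, "hwpoison: %#lx, %lu bytes lost\n", start, len);
	}
	_exit(1);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction	= sigbus_handler,
		.sa_flags	= SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* ... touch memory that the kernel has marked hwpoisoned ... */
	pause();
	return 0;
}
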
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 113eba3d3c386e139453abedb72f1da8dc844097..a14328d270e855a4d16d3772b9ad464bab296bdc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
        return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+                               struct page *newpage, struct page *page)
+{
+       int rc;
+
+       rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+       if (rc)
+               return rc;
+       migrate_page_copy(newpage, page);
+
+       return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
        .write_begin    = hugetlbfs_write_begin,
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
+       .migratepage    = hugetlbfs_migrate_page,
 };
 
 
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 74047304b01a4300bf7f781ad4d2838dc3bc316b..492465b451ddd34f6f60214fb48224687649faa9 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
                err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
 #ifdef __ARCH_SI_TRAPNO
                err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+               /* 
+                * Other callers might not initialize the si_lsb field,
+                * so check explicitly for the right codes here.
+                */
+               if (kinfo->si_code == BUS_MCEERR_AR ||
+                   kinfo->si_code == BUS_MCEERR_AO)
+                       err |= __put_user((short) kinfo->si_addr_lsb,
+                                         &uinfo->ssi_addr_lsb);
 #endif
                break;
        case __SI_CHLD:
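
The new ssi_addr_lsb field makes the same information available through signalfd(2). A hypothetical reader loop, useful mainly for BUS_MCEERR_AO notifications since an _AR fault is synchronous (assumes a libc whose struct signalfd_siginfo already carries ssi_addr_lsb):

#include <sys/signalfd.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct signalfd_siginfo fdsi;
	sigset_t mask;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGBUS);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* route SIGBUS to the fd */

	sfd = signalfd(-1, &mask, 0);
	if (sfd < 0) {
		perror("signalfd");
		return 1;
	}

	while (read(sfd, &fdsi, sizeof(fdsi)) == sizeof(fdsi)) {
		if (fdsi.ssi_signo == SIGBUS &&
		    (fdsi.ssi_code == BUS_MCEERR_AR ||
		     fdsi.ssi_code == BUS_MCEERR_AO))
			printf("hwpoison at %#llx, lsb %u\n",
			       (unsigned long long)fdsi.ssi_addr,
			       (unsigned)fdsi.ssi_addr_lsb);
	}
	return 0;
}
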
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f479700df61b186118d607a0db86b1e47ffa8294..943c76b3d4bb94d9536ecf8e324bea8ffda853e4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
-void __isolate_hwpoisoned_huge_page(struct page *page);
+int dequeue_hwpoisoned_huge_page(struct page *page);
+void copy_huge_page(struct page *dst, struct page *src);
 
 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)    ({ BUG(); 0; })
 #define huge_pte_offset(mm, address)   0
-#define __isolate_hwpoisoned_huge_page(page)   0
+#define dequeue_hwpoisoned_huge_page(page)     0
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
@@ -228,6 +232,8 @@ struct huge_bootmem_page {
        struct hstate *hstate;
 };
 
+struct page *alloc_huge_page_node(struct hstate *h, int nid);
+
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
        return size_to_hstate(PAGE_SIZE << compound_order(page));
 }
 
+static inline unsigned hstate_index_to_shift(unsigned index)
+{
+       return hstates[index].order + PAGE_SHIFT;
+}
+
 #else
 struct hstate {};
+#define alloc_huge_page_node(h, nid) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
 #define hstate_vma(v) NULL
@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
        return 1;
 }
+#define hstate_index_to_shift(index) 0
 #endif
 
 #endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd40194e980dad7683c854fd88a6047..085527fb82610065e3b8566372780cd6910c2cf3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
                        struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
                        unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+                       unsigned long private, int offlining);
 
 extern int fail_migrate_page(struct address_space *,
                        struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
                const nodemask_t *from, const nodemask_t *to,
                unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+                                 struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
                unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+               unsigned long private, int offlining) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
        return -ENOSYS;
 }
 
+static inline void migrate_page_copy(struct page *newpage,
+                                    struct page *page) {}
+
+static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
+                                 struct page *newpage, struct page *page)
+{
+       return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7687228dd3b7d16530cf28087ea71f07155ee3ba..a4c66846fb8f9547f71820083ac03a3fa468880c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS        0x0002
 #define VM_FAULT_MAJOR 0x0004
 #define VM_FAULT_WRITE 0x0008  /* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010       /* Hit poisoned page */
+#define VM_FAULT_HWPOISON 0x0010       /* Hit poisoned small page */
+#define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 
 #define VM_FAULT_NOPAGE        0x0100  /* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
+#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+
+#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
+                        VM_FAULT_HWPOISON_LARGE)
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
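
The HINDEX bits above are a small encoding convention: hugetlb_fault() stores the index of the faulting hstate in bits 12-15 of the VM_FAULT code, and force_sig_info_fault() turns it back into an address shift via hstate_index_to_shift(). A standalone round-trip sketch, with the macro values copied from this hunk; the single-entry hstate table (a 2 MB page, order 9 on x86) is illustrative only:

#include <stdio.h>

/* Macro values copied from the include/linux/mm.h hunk above */
#define VM_FAULT_HWPOISON_LARGE	0x0020
#define VM_FAULT_SET_HINDEX(x)	((x) << 12)
#define VM_FAULT_GET_HINDEX(x)	(((x) >> 12) & 0xf)

#define PAGE_SHIFT 12			/* x86 */

int main(void)
{
	/* Illustrative stand-in for the kernel's hstates[] table */
	unsigned int hstate_order[] = { 9 };	/* index 0: 2 MB huge pages */
	unsigned int hindex = 0;

	/* What hugetlb_fault() returns for a poisoned huge page */
	unsigned int fault = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hindex);

	/* What the x86 fault handler recovers for siginfo.si_addr_lsb */
	unsigned int lsb = hstate_order[VM_FAULT_GET_HINDEX(fault)] + PAGE_SHIFT;

	printf("fault code %#x -> si_addr_lsb %u (%lu bytes)\n",
	       fault, lsb, 1UL << lsb);
	return 0;
}
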
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index b363b916c90961a9d61877f26d900d9111aeacb7..3ff4961da9b514992cf0edabf07c1b08d2121027 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -33,6 +33,7 @@ struct signalfd_siginfo {
        __u64 ssi_utime;
        __u64 ssi_stime;
        __u64 ssi_addr;
+       __u16 ssi_addr_lsb;
 
        /*
         * Pad strcture to 128 bytes. Remember to update the
@@ -43,7 +44,7 @@ struct signalfd_siginfo {
         * comes out of a read(2) and we really don't want to have
         * a compat on read(2).
         */
-       __u8 __pad[48];
+       __u8 __pad[46];
 };
 
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182dde1d9dd2e905c0db11a6dfe2441..96991ded82fe90f46d564e97da0544889a575bf7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
        }
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
                           unsigned long addr, struct vm_area_struct *vma)
 {
        int i;
        struct hstate *h = hstate_vma(vma);
        struct page *dst_base = dst;
        struct page *src_base = src;
-       might_sleep();
+
        for (i = 0; i < pages_per_huge_page(h); ) {
                cond_resched();
                copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
                src = mem_map_next(src, src_base, i);
        }
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
                           unsigned long addr, struct vm_area_struct *vma)
 {
        int i;
        struct hstate *h = hstate_vma(vma);
 
        if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-               copy_gigantic_page(dst, src, addr, vma);
+               copy_user_gigantic_page(dst, src, addr, vma);
                return;
        }
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
        }
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+       int i;
+       struct hstate *h = page_hstate(src);
+       struct page *dst_base = dst;
+       struct page *src_base = src;
+
+       for (i = 0; i < pages_per_huge_page(h); ) {
+               cond_resched();
+               copy_highpage(dst, src);
+
+               i++;
+               dst = mem_map_next(dst, dst_base, i);
+               src = mem_map_next(src, src_base, i);
+       }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+       int i;
+       struct hstate *h = page_hstate(src);
+
+       if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+               copy_gigantic_page(dst, src);
+               return;
+       }
+
+       might_sleep();
+       for (i = 0; i < pages_per_huge_page(h); i++) {
+               cond_resched();
+               copy_highpage(dst + i, src + i);
+       }
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
        int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
        h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+       struct page *page;
+
+       if (list_empty(&h->hugepage_freelists[nid]))
+               return NULL;
+       page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+       list_del(&page->lru);
+       set_page_refcounted(page);
+       h->free_huge_pages--;
+       h->free_huge_pages_node[nid]--;
+       return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve)
 {
-       int nid;
        struct page *page = NULL;
        struct mempolicy *mpol;
        nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
-               nid = zone_to_nid(zone);
-               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-                   !list_empty(&h->hugepage_freelists[nid])) {
-                       page = list_entry(h->hugepage_freelists[nid].next,
-                                         struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[nid]--;
-
-                       if (!avoid_reserve)
-                               decrement_hugepage_resv_vma(h, vma);
-
-                       break;
+               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+                       page = dequeue_huge_page_node(h, zone_to_nid(zone));
+                       if (page) {
+                               if (!avoid_reserve)
+                                       decrement_hugepage_resv_vma(h, vma);
+                               break;
+                       }
                }
        }
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
        return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
        struct page *page;
-       unsigned int nid;
+       unsigned int r_nid;
 
        if (h->order >= MAX_ORDER)
                return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        }
        spin_unlock(&hugetlb_lock);
 
-       page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-                                       __GFP_REPEAT|__GFP_NOWARN,
-                                       huge_page_order(h));
+       if (nid == NUMA_NO_NODE)
+               page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+                                  __GFP_REPEAT|__GFP_NOWARN,
+                                  huge_page_order(h));
+       else
+               page = alloc_pages_exact_node(nid,
+                       htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+                       __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
        if (page && arch_prepare_hugepage(page)) {
                __free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
        spin_lock(&hugetlb_lock);
        if (page) {
-               /*
-                * This page is now managed by the hugetlb allocator and has
-                * no users -- drop the buddy allocator's reference.
-                */
-               put_page_testzero(page);
-               VM_BUG_ON(page_count(page));
-               nid = page_to_nid(page);
+               r_nid = page_to_nid(page);
                set_compound_page_dtor(page, free_huge_page);
                /*
                 * We incremented the global counters already
                 */
-               h->nr_huge_pages_node[nid]++;
-               h->surplus_huge_pages_node[nid]++;
+               h->nr_huge_pages_node[r_nid]++;
+               h->surplus_huge_pages_node[r_nid]++;
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        } else {
                h->nr_huge_pages--;
@@ -847,6 +887,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        return page;
 }
 
+/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares about the
+ * physical address of the error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+       struct page *page;
+
+       spin_lock(&hugetlb_lock);
+       page = dequeue_huge_page_node(h, nid);
+       spin_unlock(&hugetlb_lock);
+
+       if (!page)
+               page = alloc_buddy_huge_page(h, nid);
+
+       return page;
+}
+
 /*
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = alloc_buddy_huge_page(h, NULL, 0);
-               if (!page) {
+               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               if (!page)
                        /*
                         * We were not able to allocate enough pages to
                         * satisfy the entire reservation so we free what
                         * we've allocated so far.
                         */
-                       spin_lock(&hugetlb_lock);
-                       needed = 0;
                        goto free;
-               }
 
                list_add(&page->lru, &surplus_list);
        }
@@ -908,31 +964,31 @@ retry:
        needed += allocated;
        h->resv_huge_pages += delta;
        ret = 0;
-free:
+
+       spin_unlock(&hugetlb_lock);
        /* Free the needed pages to the hugetlb pool */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                if ((--needed) < 0)
                        break;
                list_del(&page->lru);
+               /*
+                * This page is now managed by the hugetlb allocator and has
+                * no users -- drop the buddy allocator's reference.
+                */
+               put_page_testzero(page);
+               VM_BUG_ON(page_count(page));
                enqueue_huge_page(h, page);
        }
 
        /* Free unnecessary surplus pages to the buddy allocator */
+free:
        if (!list_empty(&surplus_list)) {
-               spin_unlock(&hugetlb_lock);
                list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                        list_del(&page->lru);
-                       /*
-                        * The page has a reference count of zero already, so
-                        * call free_huge_page directly instead of using
-                        * put_page.  This must be done with hugetlb_lock
-                        * unlocked which is safe because free_huge_page takes
-                        * hugetlb_lock before deciding how to free the page.
-                        */
-                       free_huge_page(page);
+                       put_page(page);
                }
-               spin_lock(&hugetlb_lock);
        }
+       spin_lock(&hugetlb_lock);
 
        return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        spin_unlock(&hugetlb_lock);
 
        if (!page) {
-               page = alloc_buddy_huge_page(h, vma, addr);
+               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
                if (!page) {
                        hugetlb_put_quota(inode->i_mapping, chg);
                        return ERR_PTR(-VM_FAULT_SIGBUS);
                }
        }
 
-       set_page_refcounted(page);
        set_page_private(page, (unsigned long) mapping);
 
        vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
        return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+       swp_entry_t swp;
+
+       if (huge_pte_none(pte) || pte_present(pte))
+               return 0;
+       swp = pte_to_swp_entry(pte);
+       if (non_swap_entry(swp) && is_migration_entry(swp)) {
+               return 1;
+       } else
+               return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
        swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
 
-       copy_huge_page(new_page, old_page, address, vma);
+       copy_user_huge_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
 
        /*
@@ -2515,21 +2583,19 @@ retry:
                        hugepage_add_new_anon_rmap(page, vma, address);
                }
        } else {
+               /*
+                * If a memory error occurs between mmap() and fault, some
+                * processes don't have a hwpoisoned swap entry for the errored
+                * virtual address, so block hugepage faults with a PG_hwpoison check.
+                */
+               if (unlikely(PageHWPoison(page))) {
+                       ret = VM_FAULT_HWPOISON | 
+                             VM_FAULT_SET_HINDEX(h - hstates);
+                       goto backout_unlocked;
+               }
                page_dup_rmap(page);
        }
 
-       /*
-        * Since memory error handler replaces pte into hwpoison swap entry
-        * at the time of error handling, a process which reserved but not have
-        * the mapping to the error hugepage does not have hwpoison swap entry.
-        * So we need to block accesses from such a process by checking
-        * PG_hwpoison bit here.
-        */
-       if (unlikely(PageHWPoison(page))) {
-               ret = VM_FAULT_HWPOISON;
-               goto backout_unlocked;
-       }
-
        /*
         * If we are going to COW a private mapping later, we examine the
         * pending reservations for this page now. This will ensure that
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        ptep = huge_pte_offset(mm, address);
        if (ptep) {
                entry = huge_ptep_get(ptep);
-               if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-                       return VM_FAULT_HWPOISON;
+               if (unlikely(is_hugetlb_entry_migration(entry))) {
+                       migration_entry_wait(mm, (pmd_t *)ptep, address);
+                       return 0;
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON_LARGE | 
+                              VM_FAULT_SET_HINDEX(h - hstates);
        }
 
        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+       struct page *page;
+       struct page *tmp;
+       struct hstate *h = page_hstate(hpage);
+       int nid = page_to_nid(hpage);
+
+       list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+               if (page == hpage)
+                       return 1;
+       return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
        struct hstate *h = page_hstate(hpage);
        int nid = page_to_nid(hpage);
+       int ret = -EBUSY;
 
        spin_lock(&hugetlb_lock);
-       list_del(&hpage->lru);
-       h->free_huge_pages--;
-       h->free_huge_pages_node[nid]--;
+       if (is_hugepage_on_freelist(hpage)) {
+               list_del(&hpage->lru);
+               set_page_refcounted(hpage);
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+               ret = 0;
+       }
        spin_unlock(&hugetlb_lock);
+       return ret;
 }
+#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe84d959b7fe5899b5916ad0ed1f14..44a8cefeae6eb9627bf8f81330ea4edaa2452d81 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
  * Free Software Foundation.
  *
  * High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
  * failure.
+ *
+ * In addition there is a "soft offline" entry point that allows the kernel to
+ * stop using not-yet-corrupted but suspicious pages without killing anything.
  *
  * Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously with respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non-linear complexity with the number of mappings. But since memory
+ * corruptions are rare we hope to get away with this. This avoids impacting
+ * the core VM.
  */
 
 /*
@@ -30,7 +35,6 @@
  * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
  * - pass bad pages to kdump next kernel
  */
-#define DEBUG 1                /* remove me in 2.6.34 */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
                return 0;
 
        /*
-        * page_mapping() does not accept slab page
+        * page_mapping() does not accept slab pages.
         */
        if (PageSlab(p))
                return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
        struct list_head nd;
        struct task_struct *tsk;
        unsigned long addr;
-       unsigned addr_valid:1;
+       char addr_valid;
 };
 
 /*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
         * a SIGKILL because the error is not contained anymore.
         */
        if (tk->addr == -EFAULT) {
-               pr_debug("MCE: Unable to find user space address %lx in %s\n",
+               pr_info("MCE: Unable to find user space address %lx in %s\n",
                        page_to_pfn(p), tsk->comm);
                tk->addr_valid = 0;
        }
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                                        pfn, err);
                } else if (page_has_private(p) &&
                                !try_to_release_page(p, GFP_NOIO)) {
-                       pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+                       pr_info("MCE %#lx: failed to release buffers\n", pfn);
                } else {
                        ret = RECOVERED;
                }
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * Issues:
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
+       int res = 0;
        struct page *hpage = compound_head(p);
        /*
         * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
         * so there is no race between isolation and mapping/unmapping.
         */
        if (!(page_mapping(hpage) || PageAnon(hpage))) {
-               __isolate_hwpoisoned_huge_page(hpage);
-               return RECOVERED;
+               res = dequeue_hwpoisoned_huge_page(hpage);
+               if (!res)
+                       return RECOVERED;
        }
        return DELAYED;
 }
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
        return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 }
 
-#define N_UNMAP_TRIES 5
-
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        struct address_space *mapping;
        LIST_HEAD(tokill);
        int ret;
-       int i;
        int kill = 1;
        struct page *hpage = compound_head(p);
 
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        if (kill)
                collect_procs(hpage, &tokill);
 
-       /*
-        * try_to_unmap can fail temporarily due to races.
-        * Try a few times (RED-PEN better strategy?)
-        */
-       for (i = 0; i < N_UNMAP_TRIES; i++) {
-               ret = try_to_unmap(hpage, ttu);
-               if (ret == SWAP_SUCCESS)
-                       break;
-               pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
-       }
-
+       ret = try_to_unmap(hpage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
                                pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * We need/can do nothing about count=0 pages.
         * 1) it's a free page, and therefore in safe hand:
         *    prep_new_page() will be the gate keeper.
-        * 2) it's part of a non-compound high order page.
+        * 2) it's a free hugepage, which is also safe:
+        *    an affected hugepage will be dequeued from hugepage freelist,
+        *    so there's no concern about reusing it ever after.
+        * 3) it's part of a non-compound high order page.
         *    Implies some kernel user: cannot stop them from
         *    R/W the page; let's pray that the page has been
         *    used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                if (is_free_buddy_page(p)) {
                        action_result(pfn, "free buddy", DELAYED);
                        return 0;
+               } else if (PageHuge(hpage)) {
+                       /*
+                        * Check "just unpoisoned", "filter hit", and
+                        * "race with other subpage."
+                        */
+                       lock_page_nosync(hpage);
+                       if (!PageHWPoison(hpage)
+                           || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+                           || (p != hpage && TestSetPageHWPoison(hpage))) {
+                               atomic_long_sub(nr_pages, &mce_bad_pages);
+                               return 0;
+                       }
+                       set_page_hwpoison_huge_page(hpage);
+                       res = dequeue_hwpoisoned_huge_page(hpage);
+                       action_result(pfn, "free huge",
+                                     res ? IGNORED : DELAYED);
+                       unlock_page(hpage);
+                       return res;
                } else {
                        action_result(pfn, "high order kernel", IGNORED);
                        return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
        page = compound_head(p);
 
        if (!PageHWPoison(p)) {
-               pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+               pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
                return 0;
        }
 
        nr_pages = 1 << compound_order(page);
 
        if (!get_page_unless_zero(page)) {
+               /*
+                * Since a hwpoisoned hugepage should have a non-zero refcount,
+                * getting here means a race between memory failure and
+                * unpoison happened. In that case unpoison fails and
+                * memory failure runs to completion.
+                */
+               if (PageHuge(page)) {
+                       pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+                       return 0;
+               }
                if (TestClearPageHWPoison(p))
                        atomic_long_sub(nr_pages, &mce_bad_pages);
-               pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+               pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
                return 0;
        }
 
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
         * the free buddy page pool.
         */
        if (TestClearPageHWPoison(page)) {
-               pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+               pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
                atomic_long_sub(nr_pages, &mce_bad_pages);
                freeit = 1;
+               if (PageHuge(page))
+                       clear_page_hwpoison_huge_page(page);
        }
-       if (PageHuge(p))
-               clear_page_hwpoison_huge_page(page);
        unlock_page(page);
 
        put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
 static struct page *new_page(struct page *p, unsigned long private, int **x)
 {
        int nid = page_to_nid(p);
-       return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+       if (PageHuge(p))
+               return alloc_huge_page_node(page_hstate(compound_head(p)),
+                                                  nid);
+       else
+               return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
         * was free.
         */
        set_migratetype_isolate(p);
+       /*
+        * When the target page is a free hugepage, just remove it
+        * from free hugepage list.
+        */
        if (!get_page_unless_zero(compound_head(p))) {
-               if (is_free_buddy_page(p)) {
-                       pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+               if (PageHuge(p)) {
+                       pr_info("get_any_page: %#lx free huge page\n", pfn);
+                       ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+               } else if (is_free_buddy_page(p)) {
+                       pr_info("get_any_page: %#lx free buddy page\n", pfn);
                        /* Set hwpoison bit while page is still isolated */
                        SetPageHWPoison(p);
                        ret = 0;
                } else {
-                       pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+                       pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
                                pfn, p->flags);
                        ret = -EIO;
                }
@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
        return ret;
 }
 
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+       int ret;
+       unsigned long pfn = page_to_pfn(page);
+       struct page *hpage = compound_head(page);
+       LIST_HEAD(pagelist);
+
+       ret = get_any_page(page, pfn, flags);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               goto done;
+
+       if (PageHWPoison(hpage)) {
+               put_page(hpage);
+               pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+               return -EBUSY;
+       }
+
+       /* Keep page count to indicate a given hugepage is isolated. */
+
+       list_add(&hpage->lru, &pagelist);
+       ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+       if (ret) {
+               pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                        pfn, ret, page->flags);
+               if (ret > 0)
+                       ret = -EIO;
+               return ret;
+       }
+done:
+       if (!PageHWPoison(hpage))
+               atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+       set_page_hwpoison_huge_page(hpage);
+       dequeue_hwpoisoned_huge_page(hpage);
+       /* keep elevated page count for bad page */
+       return ret;
+}
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags)
        int ret;
        unsigned long pfn = page_to_pfn(page);
 
+       if (PageHuge(page))
+               return soft_offline_huge_page(page, flags);
+
        ret = get_any_page(page, pfn, flags);
        if (ret < 0)
                return ret;
@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags)
                        goto done;
        }
        if (!PageLRU(page)) {
-               pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+               pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
                                pfn, page->flags);
                return -EIO;
        }
@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags)
        if (PageHWPoison(page)) {
                unlock_page(page);
                put_page(page);
-               pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+               pr_info("soft offline: %#lx page already poisoned\n", pfn);
                return -EBUSY;
        }
 
@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags)
        put_page(page);
        if (ret == 1) {
                ret = 0;
-               pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+               pr_info("soft_offline: %#lx: invalidated\n", pfn);
                goto done;
        }
 
@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags)
                list_add(&page->lru, &pagelist);
                ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
                if (ret) {
-                       pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                       pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                                pfn, ret, page->flags);
                        if (ret > 0)
                                ret = -EIO;
                }
        } else {
-               pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+               pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
                                pfn, ret, page_count(page), page->flags);
        }
        if (ret)
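
With migrate_huge_pages() and soft_offline_huge_page() in place, the existing soft-offline interfaces can now be exercised on hugepages too. One hypothetical way to poke the path from userspace (requires root and CONFIG_MEMORY_FAILURE; the 2 MB hugepage size, the fallback MADV_SOFT_OFFLINE value, and a populated hugepage pool are all assumptions of this sketch):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* asm-generic value; check your arch */
#endif

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumes 2 MB hugepages */

int main(void)
{
	void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0x5a, HPAGE_SIZE);	/* fault the hugepage in */

	/* Ask the kernel to migrate the contents away and retire the page */
	if (madvise(p, HPAGE_SIZE, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("hugepage at %p soft-offlined\n", p);

	munmap(p, HPAGE_SIZE);
	return 0;
}
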
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedeffc236a9c7285689fe4720409bd30..af82741caaa496886efa24b6c57817c4ccf80c4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                        if (ret & VM_FAULT_OOM)
                                                return i ? i : -ENOMEM;
                                        if (ret &
-                                           (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+                                           (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+                                            VM_FAULT_SIGBUS))
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4b008a85f03e6635332f46ba0932b..f8c9bccf252039fd631b5020fe66e8798d8193f7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
        pte_t *ptep, pte;
        spinlock_t *ptl;
 
-       pgd = pgd_offset(mm, addr);
-       if (!pgd_present(*pgd))
-               goto out;
+       if (unlikely(PageHuge(new))) {
+               ptep = huge_pte_offset(mm, addr);
+               if (!ptep)
+                       goto out;
+               ptl = &mm->page_table_lock;
+       } else {
+               pgd = pgd_offset(mm, addr);
+               if (!pgd_present(*pgd))
+                       goto out;
 
-       pud = pud_offset(pgd, addr);
-       if (!pud_present(*pud))
-               goto out;
+               pud = pud_offset(pgd, addr);
+               if (!pud_present(*pud))
+                       goto out;
 
-       pmd = pmd_offset(pud, addr);
-       if (!pmd_present(*pmd))
-               goto out;
+               pmd = pmd_offset(pud, addr);
+               if (!pmd_present(*pmd))
+                       goto out;
 
-       ptep = pte_offset_map(pmd, addr);
+               ptep = pte_offset_map(pmd, addr);
 
-       if (!is_swap_pte(*ptep)) {
-               pte_unmap(ptep);
-               goto out;
-       }
+               if (!is_swap_pte(*ptep)) {
+                       pte_unmap(ptep);
+                       goto out;
+               }
+
+               ptl = pte_lockptr(mm, pmd);
+       }
 
-       ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+       if (PageHuge(new))
+               pte = pte_mkhuge(pte);
+#endif
        flush_cache_page(vma, addr, pte_pfn(pte));
        set_pte_at(mm, addr, ptep, pte);
 
-       if (PageAnon(new))
+       if (PageHuge(new)) {
+               if (PageAnon(new))
+                       hugepage_add_anon_rmap(new, vma, addr);
+               else
+                       page_dup_rmap(new);
+       } else if (PageAnon(new))
                page_add_anon_rmap(new, vma, addr);
        else
                page_add_file_rmap(new);
@@ -275,12 +293,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        return 0;
 }
 
+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+                                  struct page *newpage, struct page *page)
+{
+       int expected_count;
+       void **pslot;
+
+       if (!mapping) {
+               if (page_count(page) != 1)
+                       return -EAGAIN;
+               return 0;
+       }
+
+       spin_lock_irq(&mapping->tree_lock);
+
+       pslot = radix_tree_lookup_slot(&mapping->page_tree,
+                                       page_index(page));
+
+       expected_count = 2 + page_has_private(page);
+       if (page_count(page) != expected_count ||
+           (struct page *)radix_tree_deref_slot(pslot) != page) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return -EAGAIN;
+       }
+
+       if (!page_freeze_refs(page, expected_count)) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return -EAGAIN;
+       }
+
+       get_page(newpage);
+
+       radix_tree_replace_slot(pslot, newpage);
+
+       page_unfreeze_refs(page, expected_count);
+
+       __put_page(page);
+
+       spin_unlock_irq(&mapping->tree_lock);
+       return 0;
+}
+
 /*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-       copy_highpage(newpage, page);
+       if (PageHuge(page))
+               copy_huge_page(newpage, page);
+       else
+               copy_highpage(newpage, page);
 
        if (PageError(page))
                SetPageError(newpage);
@@ -723,6 +789,92 @@ move_newpage:
        return rc;
 }
 
+/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepages.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and the writeback status of all subpages is counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference count of the head page is 512 and a bit
+ * more.) This means that when we try to migrate a hugepage whose subpages
+ * are doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+                               unsigned long private, struct page *hpage,
+                               int force, int offlining)
+{
+       int rc = 0;
+       int *result = NULL;
+       struct page *new_hpage = get_new_page(hpage, private, &result);
+       int rcu_locked = 0;
+       struct anon_vma *anon_vma = NULL;
+
+       if (!new_hpage)
+               return -ENOMEM;
+
+       rc = -EAGAIN;
+
+       if (!trylock_page(hpage)) {
+               if (!force)
+                       goto out;
+               lock_page(hpage);
+       }
+
+       if (PageAnon(hpage)) {
+               rcu_read_lock();
+               rcu_locked = 1;
+
+               if (page_mapped(hpage)) {
+                       anon_vma = page_anon_vma(hpage);
+                       atomic_inc(&anon_vma->external_refcount);
+               }
+       }
+
+       try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+       if (!page_mapped(hpage))
+               rc = move_to_new_page(new_hpage, hpage, 1);
+
+       if (rc)
+               remove_migration_ptes(hpage, hpage);
+
+       if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+                                           &anon_vma->lock)) {
+               int empty = list_empty(&anon_vma->head);
+               spin_unlock(&anon_vma->lock);
+               if (empty)
+                       anon_vma_free(anon_vma);
+       }
+
+       if (rcu_locked)
+               rcu_read_unlock();
+out:
+       unlock_page(hpage);
+
+       if (rc != -EAGAIN) {
+               list_del(&hpage->lru);
+               put_page(hpage);
+       }
+
+       put_page(new_hpage);
+
+       if (result) {
+               if (rc)
+                       *result = rc;
+               else
+                       *result = page_to_nid(new_hpage);
+       }
+       return rc;
+}
+
 /*
  * migrate_pages
  *
@@ -788,6 +940,52 @@ out:
        return nr_failed + retry;
 }
 
+int migrate_huge_pages(struct list_head *from,
+               new_page_t get_new_page, unsigned long private, int offlining)
+{
+       int retry = 1;
+       int nr_failed = 0;
+       int pass = 0;
+       struct page *page;
+       struct page *page2;
+       int rc;
+
+       for (pass = 0; pass < 10 && retry; pass++) {
+               retry = 0;
+
+               list_for_each_entry_safe(page, page2, from, lru) {
+                       cond_resched();
+
+                       rc = unmap_and_move_huge_page(get_new_page,
+                                       private, page, pass > 2, offlining);
+
+                       switch(rc) {
+                       case -ENOMEM:
+                               goto out;
+                       case -EAGAIN:
+                               retry++;
+                               break;
+                       case 0:
+                               break;
+                       default:
+                               /* Permanent failure */
+                               nr_failed++;
+                               break;
+                       }
+               }
+       }
+       rc = 0;
+out:
+
+       list_for_each_entry_safe(page, page2, from, lru)
+               put_page(page);
+
+       if (rc)
+               return rc;
+
+       return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
diff --git a/mm/rmap.c b/mm/rmap.c
index 5f17fad1bee8bfcd6281fd955f21019480651d94..f5ad996a4a8f76615d1f383fde5e4a783375a7b7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
 }
 
 /**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page:      the page to add the mapping to
- * @vma:       the vm area in which the mapping is added
- * @address:   the user virtual address mapped
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page:      Page to add to rmap     
+ * @vma:       VM area to add page to.
+ * @address:   User virtual address of the mapping     
  * @exclusive: the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
 
        BUG_ON(!anon_vma);
 
+       if (PageAnon(page))
+               return;
+
        /*
         * If the page isn't exclusively mapped into this vma,
         * we must use the _oldest_ possible anon_vma for the
         * page mapping!
         */
-       if (!exclusive) {
-               if (PageAnon(page))
-                       return;
+       if (!exclusive)
                anon_vma = anon_vma->root;
-       } else {
-               /*
-                * In this case, swapped-out-but-not-discarded swap-cache
-                * is remapped. So, no need to update page->mapping here.
-                * We convice anon_vma poitned by page->mapping is not obsolete
-                * because vma->anon_vma is necessary to be a family of it.
-                */
-               if (PageAnon(page))
-                       return;
-       }
 
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;