Merge tag 'please-pull-fix-ia64-build' of git://git.kernel.org/pub/scm/linux/kernel...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 23 Feb 2013 03:27:23 +0000 (19:27 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 23 Feb 2013 03:27:23 +0000 (19:27 -0800)
Pull ia64 build breakage fix from Tony Luck.

* tag 'please-pull-fix-ia64-build' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux:
  sched: move RR_TIMESLICE from sysctl.h to rt.h

36 files changed:
Documentation/lockstat.txt
Documentation/x86/early-microcode.txt [new file with mode: 0644]
arch/x86/Kconfig
arch/x86/include/asm/microcode.h
arch/x86/include/asm/microcode_intel.h [new file with mode: 0644]
arch/x86/include/asm/processor.h
arch/x86/include/asm/proto.h
arch/x86/include/asm/tlbflush.h
arch/x86/kernel/Makefile
arch/x86/kernel/cpu/common.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/microcode_core.c
arch/x86/kernel/microcode_core_early.c [new file with mode: 0644]
arch/x86/kernel/microcode_intel.c
arch/x86/kernel/microcode_intel_early.c [new file with mode: 0644]
arch/x86/kernel/microcode_intel_lib.c [new file with mode: 0644]
arch/x86/mm/init.c
arch/x86/xen/mmu.c
drivers/char/random.c
drivers/idle/i7300_idle.c
drivers/usb/chipidea/debug.c
fs/file.c
include/asm-generic/cmpxchg-local.h
include/linux/idr.h
include/linux/lockdep.h
include/linux/seqlock.h
kernel/futex.c
kernel/futex_compat.c
kernel/lockdep.c
kernel/time/ntp.c
kernel/watchdog.c
lib/locking-selftest.c
lib/rwsem-spinlock.c
lib/rwsem.c

index cef00d4..dd2f7b2 100644 (file)
@@ -65,7 +65,7 @@ that had to wait on lock acquisition.
 
  - CONFIGURATION
 
-Lock statistics are enabled via CONFIG_LOCK_STATS.
+Lock statistics are enabled via CONFIG_LOCK_STAT.
 
  - USAGE
 
diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt
new file mode 100644 (file)
index 0000000..4aaf0df
--- /dev/null
@@ -0,0 +1,43 @@
+Early load microcode
+====================
+By Fenghua Yu <fenghua.yu@intel.com>
+
+The kernel can update microcode early during boot. Loading microcode early
+can fix CPU issues before they are observed during kernel boot time.
+
+Microcode is stored in an initrd file. The microcode is read from the initrd
+file and loaded into the CPUs during boot time.
+
+The format of the combined initrd image is microcode in cpio format followed
+by the (possibly compressed) initrd image. The kernel parses the combined
+initrd image during boot time. The microcode file in the cpio name space is:
+kernel/x86/microcode/GenuineIntel.bin
+
+During BSP boot (before SMP starts), if the kernel finds the microcode file in
+the initrd file, it parses the microcode and saves matching microcode in
+memory. If matching microcode is found, it is applied on the BSP and later on
+all APs.
+
+The cached microcode patch is applied when CPUs resume from a sleep state.
+
+There are two legacy user space interfaces for loading microcode: the
+/dev/cpu/microcode device and the /sys/devices/system/cpu/microcode/reload
+file in sysfs.
+
+In addition to these two legacy methods, the early loading method described
+here is a third method with which microcode can be uploaded to a system's
+CPUs.
+
+The following example script shows how to generate a new combined initrd file
+in /boot/initrd-3.5.0.ucode.img from the original microcode file microcode.bin
+and the original initrd image /boot/initrd-3.5.0.img.
+
+mkdir initrd
+cd initrd
+mkdir kernel
+mkdir kernel/x86
+mkdir kernel/x86/microcode
+cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin
+find . | cpio -oc >../ucode.cpio
+cd ..
+cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img
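+
+A successful early update can be verified from the kernel log, which contains
+one line per updated CPU of the form:
+
+CPU<n> microcode updated early to revision 0x<rev>, date = <yyyy-mm-dd>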
index ff0e5f3..4ebc7a6 100644 (file)
@@ -1054,6 +1054,24 @@ config MICROCODE_OLD_INTERFACE
        def_bool y
        depends on MICROCODE
 
+config MICROCODE_INTEL_LIB
+       def_bool y
+       depends on MICROCODE_INTEL
+
+config MICROCODE_INTEL_EARLY
+       bool "Early load microcode"
+       depends on MICROCODE_INTEL && BLK_DEV_INITRD
+       default y
+       help
+         This option provides functionality to read additional microcode data
+         at the beginning of the initrd image. The data tells the kernel to
+         load microcode onto CPUs as early as possible. There is no functional
+         change if no microcode data is glued to the initrd, therefore it is
+         safe to say Y.
+
+config MICROCODE_EARLY
+       def_bool y
+       depends on MICROCODE_INTEL_EARLY
+
 config X86_MSR
        tristate "/dev/cpu/*/msr - Model-specific register support"
        ---help---
index 43d921b..6825e2e 100644 (file)
@@ -57,4 +57,18 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
 static inline void __exit exit_amd_microcode(void) {}
 #endif
 
+#ifdef CONFIG_MICROCODE_EARLY
+#define MAX_UCODE_COUNT 128
+extern void __init load_ucode_bsp(void);
+extern __init void load_ucode_ap(void);
+extern int __init save_microcode_in_initrd(void);
+#else
+static inline void __init load_ucode_bsp(void) {}
+static inline __init void load_ucode_ap(void) {}
+static inline int __init save_microcode_in_initrd(void)
+{
+       return 0;
+}
+#endif
+
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
new file mode 100644 (file)
index 0000000..5356f92
--- /dev/null
@@ -0,0 +1,85 @@
+#ifndef _ASM_X86_MICROCODE_INTEL_H
+#define _ASM_X86_MICROCODE_INTEL_H
+
+#include <asm/microcode.h>
+
+struct microcode_header_intel {
+       unsigned int            hdrver;
+       unsigned int            rev;
+       unsigned int            date;
+       unsigned int            sig;
+       unsigned int            cksum;
+       unsigned int            ldrver;
+       unsigned int            pf;
+       unsigned int            datasize;
+       unsigned int            totalsize;
+       unsigned int            reserved[3];
+};
+
+struct microcode_intel {
+       struct microcode_header_intel hdr;
+       unsigned int            bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+       unsigned int            sig;
+       unsigned int            pf;
+       unsigned int            cksum;
+};
+
+struct extended_sigtable {
+       unsigned int            count;
+       unsigned int            cksum;
+       unsigned int            reserved[3];
+       struct extended_signature sigs[0];
+};
+
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE         (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE                (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE     (sizeof(struct extended_signature))
+#define DWSIZE                 (sizeof(u32))
+
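+/*
+ * A zero totalsize/datasize field in the header means the default sizes
+ * apply: 2000 data bytes plus the 48-byte header, 2048 bytes total.
+ */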
+#define get_totalsize(mc) \
+       (((struct microcode_intel *)mc)->hdr.totalsize ? \
+        ((struct microcode_intel *)mc)->hdr.totalsize : \
+        DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+       (((struct microcode_intel *)mc)->hdr.datasize ? \
+        ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
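+/*
+ * Two signatures match when they are equal and their platform-flag masks
+ * intersect; two all-zero masks also count as a match.
+ */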
+#define sigmatch(s1, s2, p1, p2) \
+       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+extern int
+get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
+extern int microcode_sanity_check(void *mc, int print_err);
+extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
+extern int
+update_match_revision(struct microcode_header_intel *mc_header, int rev);
+
+#ifdef CONFIG_MICROCODE_INTEL_EARLY
+extern void __init load_ucode_intel_bsp(void);
+extern void __cpuinit load_ucode_intel_ap(void);
+extern void show_ucode_info_early(void);
+#else
+static inline __init void load_ucode_intel_bsp(void) {}
+static inline __cpuinit void load_ucode_intel_ap(void) {}
+static inline void show_ucode_info_early(void) {}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+extern int save_mc_for_early(u8 *mc);
+#else
+static inline int save_mc_for_early(u8 *mc)
+{
+       return 0;
+}
+#endif
+
+#endif /* _ASM_X86_MICROCODE_INTEL_H */
index 8277941..3270116 100644 (file)
@@ -180,6 +180,14 @@ extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
 extern void detect_extended_topology(struct cpuinfo_x86 *c);
 extern void detect_ht(struct cpuinfo_x86 *c);
 
+#ifdef CONFIG_X86_32
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+       return 1;
+}
+#endif
 static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
                                unsigned int *ecx, unsigned int *edx)
 {
index 6f414ed..6fd3fd7 100644 (file)
@@ -5,8 +5,6 @@
 
 /* misc architecture specific prototypes */
 
-void early_idt_handler(void);
-
 void system_call(void);
 void syscall_init(void);
 
index 0fee48e..50a7fc0 100644 (file)
@@ -20,10 +20,20 @@ static inline void __native_flush_tlb(void)
        native_write_cr3(native_read_cr3());
 }
 
+static inline void __native_flush_tlb_global_irq_disabled(void)
+{
+       unsigned long cr4;
+
+       cr4 = native_read_cr4();
+       /* clear PGE */
+       native_write_cr4(cr4 & ~X86_CR4_PGE);
+       /* write old PGE again and flush TLBs */
+       native_write_cr4(cr4);
+}
+
 static inline void __native_flush_tlb_global(void)
 {
        unsigned long flags;
-       unsigned long cr4;
 
        /*
         * Read-modify-write to CR4 - protect it from preemption and
@@ -32,11 +42,7 @@ static inline void __native_flush_tlb_global(void)
         */
        raw_local_irq_save(flags);
 
-       cr4 = native_read_cr4();
-       /* clear PGE */
-       native_write_cr4(cr4 & ~X86_CR4_PGE);
-       /* write old PGE again and flush TLBs */
-       native_write_cr4(cr4);
+       __native_flush_tlb_global_irq_disabled();
 
        raw_local_irq_restore(flags);
 }
index ac3b3d0..7bd3bd3 100644 (file)
@@ -87,6 +87,9 @@ obj-$(CONFIG_PARAVIRT_CLOCK)  += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
 
+obj-$(CONFIG_MICROCODE_EARLY)          += microcode_core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY)    += microcode_intel_early.o
+obj-$(CONFIG_MICROCODE_INTEL_LIB)      += microcode_intel_lib.o
 microcode-y                            := microcode_core.o
 microcode-$(CONFIG_MICROCODE_INTEL)    += microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD)      += microcode_amd.o
index 9c3ab43..d814772 100644 (file)
@@ -37,6 +37,8 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/pat.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
@@ -213,7 +215,7 @@ static inline int flag_is_changeable_p(u32 flag)
 }
 
 /* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
+int __cpuinit have_cpuid_p(void)
 {
        return flag_is_changeable_p(X86_EFLAGS_ID);
 }
@@ -249,11 +251,6 @@ static inline int flag_is_changeable_p(u32 flag)
 {
        return 1;
 }
-/* Probe for the CPUID instruction */
-static inline int have_cpuid_p(void)
-{
-       return 1;
-}
 static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 {
 }
@@ -1223,6 +1220,12 @@ void __cpuinit cpu_init(void)
        int cpu;
        int i;
 
+       /*
+        * Load microcode on this CPU if a valid microcode patch is available.
+        * This is the early microcode loading procedure.
+        */
+       load_ucode_ap();
+
        cpu = stack_smp_processor_id();
        t = &per_cpu(init_tss, cpu);
        oist = &per_cpu(orig_ist, cpu);
@@ -1314,6 +1317,8 @@ void __cpuinit cpu_init(void)
        struct tss_struct *t = &per_cpu(init_tss, cpu);
        struct thread_struct *thread = &curr->thread;
 
+       show_ucode_info_early();
+
        if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
                for (;;)
index 57334f4..c5e403f 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
 
 /*
  * Manage page tables very early on.
@@ -159,17 +160,17 @@ void __init x86_64_start_kernel(char * real_mode_data)
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
 
-       for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
-#ifdef CONFIG_EARLY_PRINTK
+       for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
                set_intr_gate(i, &early_idt_handlers[i]);
-#else
-               set_intr_gate(i, early_idt_handler);
-#endif
-       }
        load_idt((const struct desc_ptr *)&idt_descr);
 
        copy_bootdata(__va(real_mode_data));
 
+       /*
+        * Load microcode early on BSP.
+        */
+       load_ucode_bsp();
+
        if (console_loglevel == 10)
                early_printk("Kernel alive\n");
 
index 3c3f58a..73afd11 100644 (file)
@@ -144,6 +144,11 @@ ENTRY(startup_32)
        movl %eax, pa(olpc_ofw_pgd)
 #endif
 
+#ifdef CONFIG_MICROCODE_EARLY
+       /* Early load ucode on BSP. */
+       call load_ucode_bsp
+#endif
+
 /*
  * Initialize page tables.  This creates a PDE and a set of page
  * tables, which are located immediately beyond __brk_base.  The variable
@@ -299,6 +304,12 @@ ENTRY(startup_32_smp)
        movl %eax,%ss
        leal -__PAGE_OFFSET(%ecx),%esp
 
+#ifdef CONFIG_MICROCODE_EARLY
+       /* Early load ucode on AP. */
+       call load_ucode_ap
+#endif
+
 default_entry:
 #define CR0_STATE      (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
                         X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
index d94f6d6..b7de3b2 100644 (file)
@@ -336,6 +336,7 @@ early_idt_handlers:
        i = i + 1
        .endr
 
+/* This is global to keep gas from relaxing the jumps */
 ENTRY(early_idt_handler)
        cld
 
@@ -404,6 +405,7 @@ ENTRY(early_idt_handler)
        addq $16,%rsp           # drop vector number and error code
        decl early_recursion_flag(%rip)
        INTERRUPT_RETURN
+ENDPROC(early_idt_handler)
 
        __INITDATA
 
index 3a04b22..22db92b 100644 (file)
@@ -364,10 +364,7 @@ static struct attribute_group mc_attr_group = {
 
 static void microcode_fini_cpu(int cpu)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
        microcode_ops->microcode_fini_cpu(cpu);
-       uci->valid = 0;
 }
 
 static enum ucode_state microcode_resume_cpu(int cpu)
@@ -383,6 +380,10 @@ static enum ucode_state microcode_resume_cpu(int cpu)
 static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
 {
        enum ucode_state ustate;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+       if (uci && uci->valid)
+               return UCODE_OK;
 
        if (collect_cpu_info(cpu))
                return UCODE_ERROR;
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c
new file mode 100644 (file)
index 0000000..577db84
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ *     X86 CPU microcode early update for Linux
+ *
+ *     Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *                        H. Peter Anvin <hpa@zytor.com>
+ *
+ *     This driver allows early upgrading of microcode on Intel processors
+ *     belonging to the IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ *     Software Developer's Manual.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx)       \
+               (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
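+
+/*
+ * CPUID leaf 0 returns the vendor string packed into ebx, edx, ecx (in that
+ * register order): "Genu", "ineI", "ntel" for Intel. The QCHAR constants
+ * build the same little-endian packing, so the vendor can be identified by
+ * three register compares.
+ */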
+
+/*
+ * In the early microcode loading phase on the BSP, boot_cpu_data is not set
+ * up yet. x86_vendor() gets the vendor id for the BSP.
+ *
+ * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
+ * simplify the code, we still use x86_vendor() to get the vendor id for APs.
+ *
+ * x86_vendor() gets vendor information directly through cpuid.
+ */
+static int __cpuinit x86_vendor(void)
+{
+       u32 eax = 0x00000000;
+       u32 ebx, ecx = 0, edx;
+
+       if (!have_cpuid_p())
+               return X86_VENDOR_UNKNOWN;
+
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+               return X86_VENDOR_INTEL;
+
+       if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+               return X86_VENDOR_AMD;
+
+       return X86_VENDOR_UNKNOWN;
+}
+
+void __init load_ucode_bsp(void)
+{
+       int vendor = x86_vendor();
+
+       if (vendor == X86_VENDOR_INTEL)
+               load_ucode_intel_bsp();
+}
+
+void __cpuinit load_ucode_ap(void)
+{
+       int vendor = x86_vendor();
+
+       if (vendor == X86_VENDOR_INTEL)
+               load_ucode_intel_ap();
+}
index 3544aed..5fb2ceb 100644 (file)
@@ -79,7 +79,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 
-#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 
@@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
 MODULE_LICENSE("GPL");
 
-struct microcode_header_intel {
-       unsigned int            hdrver;
-       unsigned int            rev;
-       unsigned int            date;
-       unsigned int            sig;
-       unsigned int            cksum;
-       unsigned int            ldrver;
-       unsigned int            pf;
-       unsigned int            datasize;
-       unsigned int            totalsize;
-       unsigned int            reserved[3];
-};
-
-struct microcode_intel {
-       struct microcode_header_intel hdr;
-       unsigned int            bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-       unsigned int            sig;
-       unsigned int            pf;
-       unsigned int            cksum;
-};
-
-struct extended_sigtable {
-       unsigned int            count;
-       unsigned int            cksum;
-       unsigned int            reserved[3];
-       struct extended_signature sigs[0];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE         (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE                (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE     (sizeof(struct extended_signature))
-#define DWSIZE                 (sizeof(u32))
-
-#define get_totalsize(mc) \
-       (((struct microcode_intel *)mc)->hdr.totalsize ? \
-        ((struct microcode_intel *)mc)->hdr.totalsize : \
-        DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
-       (((struct microcode_intel *)mc)->hdr.datasize ? \
-        ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
-       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
 static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 {
        struct cpuinfo_x86 *c = &cpu_data(cpu_num);
@@ -162,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
        return 0;
 }
 
-static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
-{
-       return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
-}
-
-static inline int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
-       return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
-       unsigned long total_size, data_size, ext_table_size;
-       struct microcode_header_intel *mc_header = mc;
-       struct extended_sigtable *ext_header = NULL;
-       int sum, orig_sum, ext_sigcount = 0, i;
-       struct extended_signature *ext_sig;
-
-       total_size = get_totalsize(mc_header);
-       data_size = get_datasize(mc_header);
-
-       if (data_size + MC_HEADER_SIZE > total_size) {
-               pr_err("error! Bad data size in microcode data file\n");
-               return -EINVAL;
-       }
-
-       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-               pr_err("error! Unknown microcode update format\n");
-               return -EINVAL;
-       }
-       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-       if (ext_table_size) {
-               if ((ext_table_size < EXT_HEADER_SIZE)
-                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-                       pr_err("error! Small exttable size in microcode data file\n");
-                       return -EINVAL;
-               }
-               ext_header = mc + MC_HEADER_SIZE + data_size;
-               if (ext_table_size != exttable_size(ext_header)) {
-                       pr_err("error! Bad exttable size in microcode data file\n");
-                       return -EFAULT;
-               }
-               ext_sigcount = ext_header->count;
-       }
-
-       /* check extended table checksum */
-       if (ext_table_size) {
-               int ext_table_sum = 0;
-               int *ext_tablep = (int *)ext_header;
-
-               i = ext_table_size / DWSIZE;
-               while (i--)
-                       ext_table_sum += ext_tablep[i];
-               if (ext_table_sum) {
-                       pr_warning("aborting, bad extended signature table checksum\n");
-                       return -EINVAL;
-               }
-       }
-
-       /* calculate the checksum */
-       orig_sum = 0;
-       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-       while (i--)
-               orig_sum += ((int *)mc)[i];
-       if (orig_sum) {
-               pr_err("aborting, bad checksum\n");
-               return -EINVAL;
-       }
-       if (!ext_table_size)
-               return 0;
-       /* check extended signature checksum */
-       for (i = 0; i < ext_sigcount; i++) {
-               ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-                         EXT_SIGNATURE_SIZE * i;
-               sum = orig_sum
-                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
-                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-               if (sum) {
-                       pr_err("aborting, bad checksum\n");
-                       return -EINVAL;
-               }
-       }
-       return 0;
-}
-
 /*
  * return 0 - no update found
  * return 1 - found update
  */
-static int
-get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
 {
-       struct microcode_header_intel *mc_header = mc;
-       struct extended_sigtable *ext_header;
-       unsigned long total_size = get_totalsize(mc_header);
-       int ext_sigcount, i;
-       struct extended_signature *ext_sig;
-
-       if (!update_match_revision(mc_header, rev))
-               return 0;
-
-       if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
-               return 1;
+       struct cpu_signature cpu_sig;
+       unsigned int csig, cpf, crev;
 
-       /* Look for ext. headers: */
-       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
-               return 0;
+       collect_cpu_info(cpu, &cpu_sig);
 
-       ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-       ext_sigcount = ext_header->count;
-       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+       csig = cpu_sig.sig;
+       cpf = cpu_sig.pf;
+       crev = cpu_sig.rev;
 
-       for (i = 0; i < ext_sigcount; i++) {
-               if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
-                       return 1;
-               ext_sig++;
-       }
-       return 0;
+       return get_matching_microcode(csig, cpf, mc_intel, crev);
 }
 
-static int apply_microcode(int cpu)
+int apply_microcode(int cpu)
 {
        struct microcode_intel *mc_intel;
        struct ucode_cpu_info *uci;
@@ -300,6 +144,14 @@ static int apply_microcode(int cpu)
        if (mc_intel == NULL)
                return 0;
 
+       /*
+        * Microcode on this CPU could be updated earlier. Only apply the
+        * microcode patch in mc_intel when it is newer than the one on this
+        * CPU.
+        */
+       if (get_matching_mc(mc_intel, cpu) == 0)
+               return 0;
+
        /* write microcode via MSR 0x79 */
        wrmsr(MSR_IA32_UCODE_WRITE,
              (unsigned long) mc_intel->bits,
@@ -338,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
        unsigned int leftover = size;
        enum ucode_state state = UCODE_OK;
        unsigned int curr_mc_size = 0;
+       unsigned int csig, cpf;
 
        while (leftover) {
                struct microcode_header_intel mc_header;
@@ -362,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
                }
 
                if (get_ucode_data(mc, ucode_ptr, mc_size) ||
-                   microcode_sanity_check(mc) < 0) {
+                   microcode_sanity_check(mc, 1) < 0) {
                        break;
                }
 
-               if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+               csig = uci->cpu_sig.sig;
+               cpf = uci->cpu_sig.pf;
+               if (get_matching_microcode(csig, cpf, mc, new_rev)) {
                        vfree(new_mc);
                        new_rev = mc_header.rev;
                        new_mc  = mc;
@@ -393,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
        vfree(uci->mc);
        uci->mc = (struct microcode_intel *)new_mc;
 
+       /*
+        * If early microcode loading is supported, save this mc into
+        * permanent memory so it will be loaded early when a CPU is hot added
+        * or resumes.
+        */
+       save_mc_for_early(new_mc);
+
        pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
                 cpu, new_rev, uci->cpu_sig.rev);
 out:
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
new file mode 100644 (file)
index 0000000..7890bc8
--- /dev/null
@@ -0,0 +1,796 @@
+/*
+ *     Intel CPU microcode early update for Linux
+ *
+ *     Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *                        H. Peter Anvin <hpa@zytor.com>
+ *
+ *     This allows early upgrading of microcode on Intel processors
+ *     belonging to the IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ *     Software Developer's Manual.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+       unsigned int mc_saved_count;
+       struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state __cpuinit
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+                            unsigned int mc_saved_count,
+                            struct ucode_cpu_info *uci)
+{
+       struct microcode_intel *ucode_ptr, *new_mc = NULL;
+       int new_rev = uci->cpu_sig.rev;
+       enum ucode_state state = UCODE_OK;
+       unsigned int mc_size;
+       struct microcode_header_intel *mc_header;
+       unsigned int csig = uci->cpu_sig.sig;
+       unsigned int cpf = uci->cpu_sig.pf;
+       int i;
+
+       for (i = 0; i < mc_saved_count; i++) {
+               ucode_ptr = mc_saved_p[i];
+
+               mc_header = (struct microcode_header_intel *)ucode_ptr;
+               mc_size = get_totalsize(mc_header);
+               if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+                       new_rev = mc_header->rev;
+                       new_mc  = ucode_ptr;
+               }
+       }
+
+       if (!new_mc) {
+               state = UCODE_NFOUND;
+               goto out;
+       }
+
+       uci->mc = (struct microcode_intel *)new_mc;
+out:
+       return state;
+}
+
+static void __cpuinit
+microcode_pointer(struct microcode_intel **mc_saved,
+                 unsigned long *mc_saved_in_initrd,
+                 unsigned long initrd_start, int mc_saved_count)
+{
+       int i;
+
+       for (i = 0; i < mc_saved_count; i++)
+               mc_saved[i] = (struct microcode_intel *)
+                             (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void __cpuinit
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+              struct mc_saved_data *mc_saved_data)
+{
+       int i;
+       struct microcode_intel ***mc_saved;
+
+       mc_saved = (struct microcode_intel ***)
+                  __pa_symbol(&mc_saved_data->mc_saved);
+       for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+               struct microcode_intel *p;
+
+               p = *(struct microcode_intel **)
+                       __pa(mc_saved_data->mc_saved + i);
+               mc_saved_tmp[i] = (struct microcode_intel *)__pa(p);
+       }
+}
+#endif
+
+static enum ucode_state __cpuinit
+load_microcode(struct mc_saved_data *mc_saved_data,
+              unsigned long *mc_saved_in_initrd,
+              unsigned long initrd_start,
+              struct ucode_cpu_info *uci)
+{
+       struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+       unsigned int count = mc_saved_data->mc_saved_count;
+
+       if (!mc_saved_data->mc_saved) {
+               microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
+                                 initrd_start, count);
+
+               return generic_load_microcode_early(mc_saved_tmp, count, uci);
+       } else {
+#ifdef CONFIG_X86_32
+               microcode_phys(mc_saved_tmp, mc_saved_data);
+               return generic_load_microcode_early(mc_saved_tmp, count, uci);
+#else
+               return generic_load_microcode_early(mc_saved_data->mc_saved,
+                                                   count, uci);
+#endif
+       }
+}
+
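+/*
+ * Decode the family/model from a raw CPUID(1) signature, including the
+ * extended family/model bits. For example, signature 0x000206a7 decodes to
+ * family 0x6, model 0x2a (the extended model bits 19:16 are prepended
+ * because the family is 0x6).
+ */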
+static u8 get_x86_family(unsigned long sig)
+{
+       u8 x86;
+
+       x86 = (sig >> 8) & 0xf;
+
+       if (x86 == 0xf)
+               x86 += (sig >> 20) & 0xff;
+
+       return x86;
+}
+
+static u8 get_x86_model(unsigned long sig)
+{
+       u8 x86, x86_model;
+
+       x86 = get_x86_family(sig);
+       x86_model = (sig >> 4) & 0xf;
+
+       if (x86 == 0x6 || x86 == 0xf)
+               x86_model += ((sig >> 16) & 0xf) << 4;
+
+       return x86_model;
+}
+
+/*
+ * Given a CPU signature and a microcode patch, this function checks whether
+ * the microcode patch matches the CPU's family and model.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+                       unsigned long sig)
+{
+       u8 x86, x86_model;
+       u8 x86_ucode, x86_model_ucode;
+       struct extended_sigtable *ext_header;
+       unsigned long total_size = get_totalsize(mc_header);
+       unsigned long data_size = get_datasize(mc_header);
+       int ext_sigcount, i;
+       struct extended_signature *ext_sig;
+
+       x86 = get_x86_family(sig);
+       x86_model = get_x86_model(sig);
+
+       x86_ucode = get_x86_family(mc_header->sig);
+       x86_model_ucode = get_x86_model(mc_header->sig);
+
+       if (x86 == x86_ucode && x86_model == x86_model_ucode)
+               return UCODE_OK;
+
+       /* Look for ext. headers: */
+       if (total_size <= data_size + MC_HEADER_SIZE)
+               return UCODE_NFOUND;
+
+       ext_header = (struct extended_sigtable *)
+                    ((void *)mc_header + data_size + MC_HEADER_SIZE);
+       ext_sigcount = ext_header->count;
+       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+       for (i = 0; i < ext_sigcount; i++) {
+               x86_ucode = get_x86_family(ext_sig->sig);
+               x86_model_ucode = get_x86_model(ext_sig->sig);
+
+               if (x86 == x86_ucode && x86_model == x86_model_ucode)
+                       return UCODE_OK;
+
+               ext_sig++;
+       }
+
+       return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+              struct microcode_intel **mc_saved_src,
+              unsigned int mc_saved_count)
+{
+       int i, j;
+       struct microcode_intel **mc_saved_p;
+       int ret;
+
+       if (!mc_saved_count)
+               return -EINVAL;
+
+       /*
+        * Copy new microcode data.
+        */
+       mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
+                            GFP_KERNEL);
+       if (!mc_saved_p)
+               return -ENOMEM;
+
+       for (i = 0; i < mc_saved_count; i++) {
+               struct microcode_intel *mc = mc_saved_src[i];
+               struct microcode_header_intel *mc_header;
+               unsigned long mc_size;
+
+               /* Check the source pointer before dereferencing it. */
+               if (!mc) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+               mc_header = &mc->hdr;
+               mc_size = get_totalsize(mc_header);
+               mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
+               if (!mc_saved_p[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+               memcpy(mc_saved_p[i], mc, mc_size);
+       }
+
+       /*
+        * Point to newly saved microcode.
+        */
+       mc_saved_data->mc_saved = mc_saved_p;
+       mc_saved_data->mc_saved_count = mc_saved_count;
+
+       return 0;
+
+err:
+       for (j = 0; j < i; j++)
+               kfree(mc_saved_p[j]);
+       kfree(mc_saved_p);
+
+       return ret;
+}
+
+/*
+ * The microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has a matching signature and a newer revision than an existing
+ *   patch in mc_saved,
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have a matching model with the CPU.
+ */
+static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
+                    unsigned int *mc_saved_count_p)
+{
+       int i;
+       int found = 0;
+       unsigned int mc_saved_count = *mc_saved_count_p;
+       struct microcode_header_intel *mc_header;
+
+       mc_header = (struct microcode_header_intel *)ucode_ptr;
+       for (i = 0; i < mc_saved_count; i++) {
+               unsigned int sig, pf;
+               unsigned int new_rev;
+               struct microcode_header_intel *mc_saved_header =
+                            (struct microcode_header_intel *)mc_saved[i];
+               sig = mc_saved_header->sig;
+               pf = mc_saved_header->pf;
+               new_rev = mc_header->rev;
+
+               if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
+                       found = 1;
+                       if (update_match_revision(mc_header, new_rev)) {
+                               /*
+                                * Found an older ucode saved before.
+                                * Replace the older one with this newer
+                                * one.
+                                */
+                               mc_saved[i] =
+                                       (struct microcode_intel *)ucode_ptr;
+                               break;
+                       }
+               }
+       }
+       if (i >= mc_saved_count && !found)
+               /*
+                * This ucode was discovered in the ucode file for the first
+                * time. Save it to memory.
+                */
+               mc_saved[mc_saved_count++] =
+                                (struct microcode_intel *)ucode_ptr;
+
+       *mc_saved_count_p = mc_saved_count;
+}
+
+/*
+ * Get microcode matching the BSP's model. Only CPUs with the same model as
+ * the BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+                            void *data, size_t size,
+                            struct mc_saved_data *mc_saved_data,
+                            unsigned long *mc_saved_in_initrd,
+                            struct ucode_cpu_info *uci)
+{
+       u8 *ucode_ptr = data;
+       unsigned int leftover = size;
+       enum ucode_state state = UCODE_OK;
+       unsigned int mc_size;
+       struct microcode_header_intel *mc_header;
+       struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+       unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+       int i;
+
+       while (leftover) {
+               mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+               mc_size = get_totalsize(mc_header);
+               if (!mc_size || mc_size > leftover ||
+                       microcode_sanity_check(ucode_ptr, 0) < 0)
+                       break;
+
+               leftover -= mc_size;
+
+               /*
+                * Since APs with same family and model as the BSP may boot in
+                * the platform, we need to find and save microcode patches
+                * with the same family and model as the BSP.
+                */
+               if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+                        UCODE_OK) {
+                       ucode_ptr += mc_size;
+                       continue;
+               }
+
+               _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+
+               ucode_ptr += mc_size;
+       }
+
+       if (leftover) {
+               state = UCODE_ERROR;
+               goto out;
+       }
+
+       if (mc_saved_count == 0) {
+               state = UCODE_NFOUND;
+               goto out;
+       }
+
+       for (i = 0; i < mc_saved_count; i++)
+               mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+       mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+       return state;
+}
+
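+/*
+ * MSR accessor macros that split/join the 64-bit value and go through the
+ * native (non-paravirt) accessors, since this code may run before the
+ * kernel is fully set up.
+ */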
+#define native_rdmsr(msr, val1, val2)          \
+do {                                           \
+       u64 __val = native_read_msr((msr));     \
+       (void)((val1) = (u32)__val);            \
+       (void)((val2) = (u32)(__val >> 32));    \
+} while (0)
+
+#define native_wrmsr(msr, low, high)           \
+       native_write_msr(msr, low, high)
+
+static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+       unsigned int val[2];
+       u8 x86, x86_model;
+       struct cpu_signature csig;
+       unsigned int eax, ebx, ecx, edx;
+
+       csig.sig = 0;
+       csig.pf = 0;
+       csig.rev = 0;
+
+       memset(uci, 0, sizeof(*uci));
+
+       eax = 0x00000001;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+       csig.sig = eax;
+
+       x86 = get_x86_family(csig.sig);
+       x86_model = get_x86_model(csig.sig);
+
+       if ((x86_model >= 5) || (x86 > 6)) {
+               /* get processor flags from MSR 0x17 */
+               native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+               csig.pf = 1 << ((val[1] >> 18) & 7);
+       }
+       native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+       /* As documented in the SDM: Do a CPUID 1 here */
+       sync_core();
+
+       /* get the current revision from MSR 0x8B */
+       native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+       csig.rev = val[1];
+
+       uci->cpu_sig = csig;
+       uci->valid = 1;
+
+       return 0;
+}
+
+#ifdef DEBUG
+static void __ref show_saved_mc(void)
+{
+       int i, j;
+       unsigned int sig, pf, rev, total_size, data_size, date;
+       struct ucode_cpu_info uci;
+
+       if (mc_saved_data.mc_saved_count == 0) {
+               pr_debug("no microcode data saved.\n");
+               return;
+       }
+       pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+       collect_cpu_info_early(&uci);
+
+       sig = uci.cpu_sig.sig;
+       pf = uci.cpu_sig.pf;
+       rev = uci.cpu_sig.rev;
+       pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
+                smp_processor_id(), sig, pf, rev);
+
+       for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+               struct microcode_header_intel *mc_saved_header;
+               struct extended_sigtable *ext_header;
+               int ext_sigcount;
+               struct extended_signature *ext_sig;
+
+               mc_saved_header = (struct microcode_header_intel *)
+                                 mc_saved_data.mc_saved[i];
+               sig = mc_saved_header->sig;
+               pf = mc_saved_header->pf;
+               rev = mc_saved_header->rev;
+               total_size = get_totalsize(mc_saved_header);
+               data_size = get_datasize(mc_saved_header);
+               date = mc_saved_header->date;
+
+               pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
+                        i, sig, pf, rev, total_size,
+                        date & 0xffff,
+                        date >> 24,
+                        (date >> 16) & 0xff);
+
+               /* Look for ext. headers: */
+               if (total_size <= data_size + MC_HEADER_SIZE)
+                       continue;
+
+               ext_header = (struct extended_sigtable *)
+                            ((void *)mc_saved_header + data_size +
+                             MC_HEADER_SIZE);
+               ext_sigcount = ext_header->count;
+               ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+               for (j = 0; j < ext_sigcount; j++) {
+                       sig = ext_sig->sig;
+                       pf = ext_sig->pf;
+
+                       pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+                                j, sig, pf);
+
+                       ext_sig++;
+               }
+
+       }
+}
+#else
+static inline void show_saved_mc(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+/*
+ * Save this mc into mc_saved_data so it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * The caller must make sure that mc is a valid microcode patch before
+ * calling this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+       struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+       unsigned int mc_saved_count_init;
+       unsigned int mc_saved_count;
+       struct microcode_intel **mc_saved;
+       int ret = 0;
+       int i;
+
+       /*
+        * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+        * hotplug.
+        */
+       cpu_hotplug_driver_lock();
+
+       mc_saved_count_init = mc_saved_data.mc_saved_count;
+       mc_saved_count = mc_saved_data.mc_saved_count;
+       mc_saved = mc_saved_data.mc_saved;
+
+       if (mc_saved && mc_saved_count)
+               memcpy(mc_saved_tmp, mc_saved,
+                      mc_saved_count * sizeof(struct microcode_intel *));
+       /*
+        * Save the microcode patch mc in the mc_saved_tmp structure if it's a
+        * newer version.
+        */
+       _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+
+       /*
+        * Save mc_saved_tmp in the global mc_saved_data.
+        */
+       ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+       if (ret) {
+               pr_err("Cannot save microcode patch.\n");
+               goto out;
+       }
+
+       show_saved_mc();
+
+       /*
+        * Free old saved microcode data.
+        */
+       if (mc_saved) {
+               for (i = 0; i < mc_saved_count_init; i++)
+                       kfree(mc_saved[i]);
+               kfree(mc_saved);
+       }
+
+out:
+       cpu_hotplug_driver_unlock();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(unsigned long start, unsigned long end,
+               struct mc_saved_data *mc_saved_data,
+               unsigned long *mc_saved_in_initrd,
+               struct ucode_cpu_info *uci)
+{
+       unsigned int size = end - start;
+       struct cpio_data cd;
+       long offset = 0;
+#ifdef CONFIG_X86_32
+       char *p = (char *)__pa_symbol(ucode_name);
+#else
+       char *p = ucode_name;
+#endif
+
+       cd.data = NULL;
+       cd.size = 0;
+
+       cd = find_cpio_data(p, (void *)start, size, &offset);
+       if (!cd.data)
+               return UCODE_ERROR;
+
+       return get_matching_model_microcode(0, start, cd.data, cd.size,
+                                           mc_saved_data, mc_saved_in_initrd,
+                                           uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void __cpuinit
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+       int cpu = smp_processor_id();
+
+       pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+               cpu,
+               uci->cpu_sig.rev,
+               date & 0xffff,
+               date >> 24,
+               (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is a delayed dump.
+ */
+void __cpuinit show_ucode_info_early(void)
+{
+       struct ucode_cpu_info uci;
+
+       if (delay_ucode_info) {
+               collect_cpu_info_early(&uci);
+               print_ucode_info(&uci, current_mc_date);
+               delay_ucode_info = 0;
+       }
+}
+
+/*
+ * At this point, we cannot call printk() yet. Record the microcode patch
+ * date in current_mc_date and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void __cpuinit print_ucode(struct ucode_cpu_info *uci)
+{
+       struct microcode_intel *mc_intel;
+       int *delay_ucode_info_p;
+       int *current_mc_date_p;
+
+       mc_intel = uci->mc;
+       if (mc_intel == NULL)
+               return;
+
+       delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info);
+       current_mc_date_p = (int *)__pa_symbol(&current_mc_date);
+
+       *delay_ucode_info_p = 1;
+       *current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush the global TLB. We only do this on x86_64, where paging has been
+ * enabled already and PGE should be enabled as well.
+ */
+static inline void __cpuinit flush_tlb_early(void)
+{
+       __native_flush_tlb_global_irq_disabled();
+}
+
+static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci)
+{
+       struct microcode_intel *mc_intel;
+
+       mc_intel = uci->mc;
+       if (mc_intel == NULL)
+               return;
+
+       print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
+                                struct ucode_cpu_info *uci)
+{
+       struct microcode_intel *mc_intel;
+       unsigned int val[2];
+
+       mc_intel = uci->mc;
+       if (mc_intel == NULL)
+               return 0;
+
+       /* write microcode via MSR 0x79 */
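+       /*
+        * The double 16-bit shift yields the high half of the address without
+        * shifting a 32-bit long by 32 bits (which would be undefined); on
+        * 32-bit kernels the high half is simply 0.
+        */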
+       native_wrmsr(MSR_IA32_UCODE_WRITE,
+             (unsigned long) mc_intel->bits,
+             (unsigned long) mc_intel->bits >> 16 >> 16);
+       native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+       /* As documented in the SDM: Do a CPUID 1 here */
+       sync_core();
+
+       /* get the current revision from MSR 0x8B */
+       native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+       if (val[1] != mc_intel->hdr.rev)
+               return -1;
+
+#ifdef CONFIG_X86_64
+       /* Flush the global TLB as a precaution. */
+       flush_tlb_early();
+#endif
+       uci->cpu_sig.rev = val[1];
+
+       print_ucode(uci);
+
+       return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd(void)
+{
+       unsigned int count = mc_saved_data.mc_saved_count;
+       struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+       int ret = 0;
+
+       if (count == 0)
+               return ret;
+
+       microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+       ret = save_microcode(&mc_saved_data, mc_saved, count);
+       if (ret)
+               pr_err("Cannot save microcode patches from initrd\n");
+
+       show_saved_mc();
+
+       return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+                     unsigned long *mc_saved_in_initrd,
+                     unsigned long initrd_start_early,
+                     unsigned long initrd_end_early,
+                     struct ucode_cpu_info *uci)
+{
+       collect_cpu_info_early(uci);
+       scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
+                      mc_saved_in_initrd, uci);
+       load_microcode(mc_saved_data, mc_saved_in_initrd,
+                      initrd_start_early, uci);
+       apply_microcode_early(mc_saved_data, uci);
+}
+
+void __init
+load_ucode_intel_bsp(void)
+{
+       u64 ramdisk_image, ramdisk_size;
+       unsigned long initrd_start_early, initrd_end_early;
+       struct ucode_cpu_info uci;
+#ifdef CONFIG_X86_32
+       struct boot_params *boot_params_p;
+
+       boot_params_p = (struct boot_params *)__pa_symbol(&boot_params);
+       ramdisk_image = boot_params_p->hdr.ramdisk_image;
+       ramdisk_size  = boot_params_p->hdr.ramdisk_size;
+       initrd_start_early = ramdisk_image;
+       initrd_end_early = initrd_start_early + ramdisk_size;
+
+       _load_ucode_intel_bsp(
+               (struct mc_saved_data *)__pa_symbol(&mc_saved_data),
+               (unsigned long *)__pa_symbol(&mc_saved_in_initrd),
+               initrd_start_early, initrd_end_early, &uci);
+#else
+       ramdisk_image = boot_params.hdr.ramdisk_image;
+       ramdisk_size  = boot_params.hdr.ramdisk_size;
+       initrd_start_early = ramdisk_image + PAGE_OFFSET;
+       initrd_end_early = initrd_start_early + ramdisk_size;
+
+       _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
+                             initrd_start_early, initrd_end_early, &uci);
+#endif
+}
+
+void __cpuinit load_ucode_intel_ap(void)
+{
+       struct mc_saved_data *mc_saved_data_p;
+       struct ucode_cpu_info uci;
+       unsigned long *mc_saved_in_initrd_p;
+       unsigned long initrd_start_addr;
+#ifdef CONFIG_X86_32
+       unsigned long *initrd_start_p;
+
+       mc_saved_in_initrd_p =
+               (unsigned long *)__pa_symbol(mc_saved_in_initrd);
+       mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data);
+       initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start);
+       initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p);
+#else
+       mc_saved_data_p = &mc_saved_data;
+       mc_saved_in_initrd_p = mc_saved_in_initrd;
+       initrd_start_addr = initrd_start;
+#endif
+
+       /*
+        * If there is no valid ucode previously saved in memory, no need to
+        * update ucode on this AP.
+        */
+       if (mc_saved_data_p->mc_saved_count == 0)
+               return;
+
+       collect_cpu_info_early(&uci);
+       load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+                      initrd_start_addr, &uci);
+       apply_microcode_early(mc_saved_data_p, &uci);
+}
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c
new file mode 100644 (file)
index 0000000..ce69320
--- /dev/null
@@ -0,0 +1,174 @@
+/*
+ *     Intel CPU Microcode Update Driver for Linux
+ *
+ *     Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ *                        H. Peter Anvin <hpa@zytor.com>
+ *
+ *     This driver allows upgrading of microcode on Intel processors
+ *     belonging to the IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.11 of Volume 3a, IA-32 Intel Architecture
+ *     Software Developer's Manual
+ *     Order Number 253668 or free download from:
+ *
+ *     http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+static inline int
+update_match_cpu(unsigned int csig, unsigned int cpf,
+                unsigned int sig, unsigned int pf)
+{
+       return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+}
+
+int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+       return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+int microcode_sanity_check(void *mc, int print_err)
+{
+       unsigned long total_size, data_size, ext_table_size;
+       struct microcode_header_intel *mc_header = mc;
+       struct extended_sigtable *ext_header = NULL;
+       int sum, orig_sum, ext_sigcount = 0, i;
+       struct extended_signature *ext_sig;
+
+       total_size = get_totalsize(mc_header);
+       data_size = get_datasize(mc_header);
+
+       if (data_size + MC_HEADER_SIZE > total_size) {
+               if (print_err)
+                       pr_err("error! Bad data size in microcode data file\n");
+               return -EINVAL;
+       }
+
+       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+               if (print_err)
+                       pr_err("error! Unknown microcode update format\n");
+               return -EINVAL;
+       }
+       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+       if (ext_table_size) {
+               if ((ext_table_size < EXT_HEADER_SIZE)
+                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+                       if (print_err)
+                               pr_err("error! Small exttable size in microcode data file\n");
+                       return -EINVAL;
+               }
+               ext_header = mc + MC_HEADER_SIZE + data_size;
+               if (ext_table_size != exttable_size(ext_header)) {
+                       if (print_err)
+                               pr_err("error! Bad exttable size in microcode data file\n");
+                       return -EFAULT;
+               }
+               ext_sigcount = ext_header->count;
+       }
+
+       /* check extended table checksum */
+       if (ext_table_size) {
+               int ext_table_sum = 0;
+               int *ext_tablep = (int *)ext_header;
+
+               i = ext_table_size / DWSIZE;
+               while (i--)
+                       ext_table_sum += ext_tablep[i];
+               if (ext_table_sum) {
+                       if (print_err)
+                               pr_warn("aborting, bad extended signature table checksum\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* calculate the checksum */
+       orig_sum = 0;
+       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+       while (i--)
+               orig_sum += ((int *)mc)[i];
+       if (orig_sum) {
+               if (print_err)
+                       pr_err("aborting, bad checksum\n");
+               return -EINVAL;
+       }
+       if (!ext_table_size)
+               return 0;
+       /* check extended signature checksum */
+       for (i = 0; i < ext_sigcount; i++) {
+               ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+                         EXT_SIGNATURE_SIZE * i;
+               sum = orig_sum
+                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
+                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+               if (sum) {
+                       if (print_err)
+                               pr_err("aborting, bad checksum\n");
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(microcode_sanity_check);
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+{
+       struct microcode_header_intel *mc_header = mc;
+       struct extended_sigtable *ext_header;
+       unsigned long total_size = get_totalsize(mc_header);
+       int ext_sigcount, i;
+       struct extended_signature *ext_sig;
+
+       if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+               return 1;
+
+       /* Look for ext. headers: */
+       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+               return 0;
+
+       ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+       ext_sigcount = ext_header->count;
+       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+       for (i = 0; i < ext_sigcount; i++) {
+               if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+                       return 1;
+               ext_sig++;
+       }
+       return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+{
+       struct microcode_header_intel *mc_header = mc;
+
+       if (!update_match_revision(mc_header, rev))
+               return 0;
+
+       return get_matching_sig(csig, cpf, mc, rev);
+}
+EXPORT_SYMBOL_GPL(get_matching_microcode);
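
Intel microcode images are laid out so that the header plus data sums to
zero when read as 32-bit words, and each extended signature re-balances
that sum when its sig/pf/cksum triple replaces the header's. A minimal
standalone sketch of the invariant the checks above enforce (hypothetical
names, outside the kernel):

#include <stdint.h>
#include <stddef.h>

/* Returns 1 if the region checksums to zero when summed as dwords. */
static int dword_sum_is_zero(const void *buf, size_t len)
{
        const uint32_t *p = buf;
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i < len / sizeof(uint32_t); i++)
                sum += p[i];

        return sum == 0;
}
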
index d418152..4903a03 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/tlb.h>
 #include <asm/proto.h>
 #include <asm/dma.h>           /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
 
 #include "mm_internal.h"
 
@@ -534,6 +535,15 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
+#ifdef CONFIG_MICROCODE_EARLY
+       /*
+        * Remember, the initrd memory may contain microcode or other
+        * useful things. Before we lose the initrd memory, save its
+        * contents somewhere now that normal virtual memory is enabled.
+        */
+       save_microcode_in_initrd();
+#endif
+
        /*
         * end may not be aligned, and we cannot align it; the
         * decompressor could be confused by an aligned initrd_end
index f5e86ee..e8e3493 100644 (file)
@@ -1408,7 +1408,6 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
                xen_mc_callback(set_current_cr3, (void *)cr3);
        }
 }
-
 static void xen_write_cr3(unsigned long cr3)
 {
        BUG_ON(preemptible());
@@ -1434,6 +1433,45 @@ static void xen_write_cr3(unsigned long cr3)
        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft them in, as appropriate, into the
+ * init_level4_pgt and its friends. Then when we are happy we load
+ * the new init_level4_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed, it loads cr3.
+ * N.B. bare metal would also start at 'start_kernel' (with the early
+ * #PF handler creating the bootstrap pagetables), so by this point we
+ * are running under the same assumptions about what write_cr3 must do.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+       BUG_ON(preemptible());
+
+       xen_mc_batch();  /* disables interrupts */
+
+       /* Update while interrupts are disabled, so it is atomic with
+          respect to IPIs. */
+       this_cpu_write(xen_cr3, cr3);
+
+       __xen_write_cr3(true, cr3);
+
+       xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+
+       pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
+#endif
+
 static int xen_pgd_alloc(struct mm_struct *mm)
 {
        pgd_t *pgd = mm->pgd;
@@ -2102,11 +2140,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
        .write_cr2 = xen_write_cr2,
 
        .read_cr3 = xen_read_cr3,
-#ifdef CONFIG_X86_32
        .write_cr3 = xen_write_cr3_init,
-#else
-       .write_cr3 = xen_write_cr3,
-#endif
 
        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
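
xen_write_cr3_init() patches itself out: once the early variant has run,
it repoints pv_mmu_ops.write_cr3 at the regular xen_write_cr3, so every
later call takes the normal path. A hedged userspace sketch of that
self-retiring function-pointer pattern (hypothetical names):

#include <stdio.h>

static void write_late(unsigned long v);
static void (*write_op)(unsigned long);

/* Early variant: do the one-time setup, then retire yourself. */
static void write_early(unsigned long v)
{
        printf("early path: %#lx\n", v);
        write_op = write_late;          /* later calls take the fast path */
}

static void write_late(unsigned long v)
{
        printf("late path: %#lx\n", v);
}

int main(void)
{
        write_op = write_early;
        write_op(0x1000);               /* early, swaps the pointer */
        write_op(0x2000);               /* late */
        return 0;
}
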
index 85e81ec..594bda9 100644 (file)
@@ -445,7 +445,7 @@ static struct entropy_store input_pool = {
        .poolinfo = &poolinfo_table[0],
        .name = "input",
        .limit = 1,
-       .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock),
+       .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
        .pool = input_pool_data
 };
 
@@ -454,7 +454,7 @@ static struct entropy_store blocking_pool = {
        .name = "blocking",
        .limit = 1,
        .pull = &input_pool,
-       .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock),
+       .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock),
        .pool = blocking_pool_data
 };
 
@@ -462,7 +462,7 @@ static struct entropy_store nonblocking_pool = {
        .poolinfo = &poolinfo_table[1],
        .name = "nonblocking",
        .pull = &input_pool,
-       .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock),
+       .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
        .pool = nonblocking_pool_data
 };
 
index fa080eb..ffeebc7 100644 (file)
@@ -75,7 +75,7 @@ static unsigned long past_skip;
 
 static struct pci_dev *fbd_dev;
 
-static spinlock_t i7300_idle_lock;
+static raw_spinlock_t i7300_idle_lock;
 static int i7300_idle_active;
 
 static u8 i7300_idle_thrtctl_saved;
@@ -457,7 +457,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val,
                idle_begin_time = ktime_get();
        }
 
-       spin_lock_irqsave(&i7300_idle_lock, flags);
+       raw_spin_lock_irqsave(&i7300_idle_lock, flags);
        if (val == IDLE_START) {
 
                cpumask_set_cpu(smp_processor_id(), idle_cpumask);
@@ -506,7 +506,7 @@ static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val,
                }
        }
 end:
-       spin_unlock_irqrestore(&i7300_idle_lock, flags);
+       raw_spin_unlock_irqrestore(&i7300_idle_lock, flags);
        return 0;
 }
 
@@ -548,7 +548,7 @@ struct debugfs_file_info {
 
 static int __init i7300_idle_init(void)
 {
-       spin_lock_init(&i7300_idle_lock);
+       raw_spin_lock_init(&i7300_idle_lock);
        total_us = 0;
 
        if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload))
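
The conversion to raw_spinlock_t matters on PREEMPT_RT, where a plain
spinlock_t becomes a sleeping lock; a lock taken from an idle notifier
must remain a true spinning lock. The pattern, sketched with a
hypothetical lock:

static DEFINE_RAW_SPINLOCK(demo_lock);          /* hypothetical */

static void demo_critical_section(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_lock, flags);
        /* work that must not sleep, even on PREEMPT_RT */
        raw_spin_unlock_irqrestore(&demo_lock, flags);
}
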
index 3bc244d..a62c4a4 100644 (file)
@@ -222,7 +222,7 @@ static struct {
 } dbg_data = {
        .idx = 0,
        .tty = 0,
-       .lck = __RW_LOCK_UNLOCKED(lck)
+       .lck = __RW_LOCK_UNLOCKED(dbg_data.lck)
 };
 
 /**
index 2b3570b..3906d95 100644 (file)
--- a/fs/file.c
+++ b/fs/file.c
@@ -516,7 +516,7 @@ struct files_struct init_files = {
                .close_on_exec  = init_files.close_on_exec_init,
                .open_fds       = init_files.open_fds_init,
        },
-       .file_lock      = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
+       .file_lock      = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 };
 
 /*
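
The random.c, chipidea and init_files hunks above all fix the same class
of mistake: __SPIN_LOCK_UNLOCKED() and __RW_LOCK_UNLOCKED() take the lock
variable's name, which seeds the lockdep class, not a pointer to it or an
unrelated symbol. A minimal sketch of the correct static-initializer form,
with a hypothetical structure:

static struct demo_state {
        spinlock_t lock;
        int value;
} demo_state = {
        /* pass the member itself, not &demo_state.lock: */
        .lock  = __SPIN_LOCK_UNLOCKED(demo_state.lock),
        .value = 0,
};
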
index 2533fdd..d8d4c89 100644 (file)
@@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr,
        if (size == 8 && sizeof(unsigned long) != 8)
                wrong_size_cmpxchg(ptr);
 
-       local_irq_save(flags);
+       raw_local_irq_save(flags);
        switch (size) {
        case 1: prev = *(u8 *)ptr;
                if (prev == old)
@@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr,
        default:
                wrong_size_cmpxchg(ptr);
        }
-       local_irq_restore(flags);
+       raw_local_irq_restore(flags);
        return prev;
 }
 
@@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr,
        u64 prev;
        unsigned long flags;
 
-       local_irq_save(flags);
+       raw_local_irq_save(flags);
        prev = *(u64 *)ptr;
        if (prev == old)
                *(u64 *)ptr = new;
-       local_irq_restore(flags);
+       raw_local_irq_restore(flags);
        return prev;
 }
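
Using raw_local_irq_save()/raw_local_irq_restore() keeps irq-state
tracing and lockdep out of this lowest-level primitive, which those
facilities can themselves end up calling. For reference, the operation
being emulated is a compare-and-exchange; a self-contained C11 sketch of
the same semantics (not the kernel helper):

#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
        atomic_ulong v = 0;
        unsigned long expected = 0;

        /* if (v == expected) { v = 1; } -- reports whether it swapped */
        if (atomic_compare_exchange_strong(&v, &expected, 1UL))
                printf("swapped, v is now %lu\n",
                       (unsigned long)atomic_load(&v));

        return 0;
}
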
 
index de7e190..e5eb125 100644 (file)
@@ -136,7 +136,7 @@ struct ida {
        struct ida_bitmap       *free_bitmap;
 };
 
-#define IDA_INIT(name)         { .idr = IDR_INIT(name), .free_bitmap = NULL, }
+#define IDA_INIT(name)         { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, }
 #define DEFINE_IDA(name)       struct ida name = IDA_INIT(name)
 
 int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
index bfe88c4..f1e877b 100644 (file)
@@ -412,7 +412,7 @@ struct lock_class_key { };
 
 #define lockdep_depth(tsk)     (0)
 
-#define lockdep_assert_held(l)                 do { } while (0)
+#define lockdep_assert_held(l)                 do { (void)(l); } while (0)
 
 #define lockdep_recursing(tsk)                 (0)
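
The added (void)(l) makes the !CONFIG_LOCKDEP stub still evaluate and
discard its argument, so a lock referenced only by lockdep_assert_held()
no longer trips set-but-unused warnings. The idiom in isolation
(hypothetical macro name):

/* A stub that "uses" its argument without generating any code: */
#define assert_held_stub(l)     do { (void)(l); } while (0)

static int demo(void)
{
        int lock_like = 42;             /* referenced only by the stub */

        assert_held_stub(lock_like);    /* compiles away, no warning */
        return 0;
}
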
 
index 600060e..1829905 100644 (file)
 #include <linux/preempt.h>
 #include <asm/processor.h>
 
-typedef struct {
-       unsigned sequence;
-       spinlock_t lock;
-} seqlock_t;
-
-/*
- * These macros triggered gcc-3.x compile-time problems.  We think these are
- * OK now.  Be cautious.
- */
-#define __SEQLOCK_UNLOCKED(lockname) \
-                { 0, __SPIN_LOCK_UNLOCKED(lockname) }
-
-#define seqlock_init(x)                                        \
-       do {                                            \
-               (x)->sequence = 0;                      \
-               spin_lock_init(&(x)->lock);             \
-       } while (0)
-
-#define DEFINE_SEQLOCK(x) \
-               seqlock_t x = __SEQLOCK_UNLOCKED(x)
-
-/* Lock out other writers and update the count.
- * Acts like a normal spin_lock/unlock.
- * Don't need preempt_disable() because that is in the spin_lock already.
- */
-static inline void write_seqlock(seqlock_t *sl)
-{
-       spin_lock(&sl->lock);
-       ++sl->sequence;
-       smp_wmb();
-}
-
-static inline void write_sequnlock(seqlock_t *sl)
-{
-       smp_wmb();
-       sl->sequence++;
-       spin_unlock(&sl->lock);
-}
-
-static inline int write_tryseqlock(seqlock_t *sl)
-{
-       int ret = spin_trylock(&sl->lock);
-
-       if (ret) {
-               ++sl->sequence;
-               smp_wmb();
-       }
-       return ret;
-}
-
-/* Start of read calculation -- fetch last complete writer token */
-static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
-{
-       unsigned ret;
-
-repeat:
-       ret = ACCESS_ONCE(sl->sequence);
-       if (unlikely(ret & 1)) {
-               cpu_relax();
-               goto repeat;
-       }
-       smp_rmb();
-
-       return ret;
-}
-
-/*
- * Test if reader processed invalid data.
- *
- * If sequence value changed then writer changed data while in section.
- */
-static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
-{
-       smp_rmb();
-
-       return unlikely(sl->sequence != start);
-}
-
-
 /*
  * Version using sequence counter only.
  * This can be used when code has its own mutex protecting the
  * update, starting before the write_seqcount_begin() and ending
  * after the write_seqcount_end().
  */
-
 typedef struct seqcount {
        unsigned sequence;
 } seqcount_t;
@@ -218,7 +138,6 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
 static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
        smp_rmb();
-
        return __read_seqcount_retry(s, start);
 }
 
@@ -252,31 +171,101 @@ static inline void write_seqcount_barrier(seqcount_t *s)
        s->sequence += 2;
 }
 
+typedef struct {
+       struct seqcount seqcount;
+       spinlock_t lock;
+} seqlock_t;
+
 /*
- * Possible sw/hw IRQ protected versions of the interfaces.
+ * These macros triggered gcc-3.x compile-time problems.  We think these are
+ * OK now.  Be cautious.
  */
-#define write_seqlock_irqsave(lock, flags)                             \
-       do { local_irq_save(flags); write_seqlock(lock); } while (0)
-#define write_seqlock_irq(lock)                                                \
-       do { local_irq_disable();   write_seqlock(lock); } while (0)
-#define write_seqlock_bh(lock)                                         \
-        do { local_bh_disable();    write_seqlock(lock); } while (0)
+#define __SEQLOCK_UNLOCKED(lockname)                   \
+       {                                               \
+               .seqcount = SEQCNT_ZERO,                \
+               .lock = __SPIN_LOCK_UNLOCKED(lockname)  \
+       }
+
+#define seqlock_init(x)                                        \
+       do {                                            \
+               seqcount_init(&(x)->seqcount);          \
+               spin_lock_init(&(x)->lock);             \
+       } while (0)
 
-#define write_sequnlock_irqrestore(lock, flags)                                \
-       do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
-#define write_sequnlock_irq(lock)                                      \
-       do { write_sequnlock(lock); local_irq_enable(); } while(0)
-#define write_sequnlock_bh(lock)                                       \
-       do { write_sequnlock(lock); local_bh_enable(); } while(0)
+#define DEFINE_SEQLOCK(x) \
+               seqlock_t x = __SEQLOCK_UNLOCKED(x)
 
-#define read_seqbegin_irqsave(lock, flags)                             \
-       ({ local_irq_save(flags);   read_seqbegin(lock); })
+/*
+ * Read side functions for starting and finalizing a read side section.
+ */
+static inline unsigned read_seqbegin(const seqlock_t *sl)
+{
+       return read_seqcount_begin(&sl->seqcount);
+}
 
-#define read_seqretry_irqrestore(lock, iv, flags)                      \
-       ({                                                              \
-               int ret = read_seqretry(lock, iv);                      \
-               local_irq_restore(flags);                               \
-               ret;                                                    \
-       })
+static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
+{
+       return read_seqcount_retry(&sl->seqcount, start);
+}
+
+/*
+ * Lock out other writers and update the count.
+ * Acts like a normal spin_lock/unlock.
+ * Don't need preempt_disable() because that is in the spin_lock already.
+ */
+static inline void write_seqlock(seqlock_t *sl)
+{
+       spin_lock(&sl->lock);
+       write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock(seqlock_t *sl)
+{
+       write_seqcount_end(&sl->seqcount);
+       spin_unlock(&sl->lock);
+}
+
+static inline void write_seqlock_bh(seqlock_t *sl)
+{
+       spin_lock_bh(&sl->lock);
+       write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock_bh(seqlock_t *sl)
+{
+       write_seqcount_end(&sl->seqcount);
+       spin_unlock_bh(&sl->lock);
+}
+
+static inline void write_seqlock_irq(seqlock_t *sl)
+{
+       spin_lock_irq(&sl->lock);
+       write_seqcount_begin(&sl->seqcount);
+}
+
+static inline void write_sequnlock_irq(seqlock_t *sl)
+{
+       write_seqcount_end(&sl->seqcount);
+       spin_unlock_irq(&sl->lock);
+}
+
+static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&sl->lock, flags);
+       write_seqcount_begin(&sl->seqcount);
+       return flags;
+}
+
+#define write_seqlock_irqsave(lock, flags)                             \
+       do { flags = __write_seqlock_irqsave(lock); } while (0)
+
+static inline void
+write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
+{
+       write_seqcount_end(&sl->seqcount);
+       spin_unlock_irqrestore(&sl->lock, flags);
+}
 
 #endif /* __LINUX_SEQLOCK_H */
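
After this rework a seqlock_t is literally a seqcount_t plus the spinlock
that serializes writers, and the read/write API is unchanged. A minimal
kernel-style usage sketch (hypothetical variables):

static DEFINE_SEQLOCK(state_lock);              /* hypothetical */
static u64 state_a, state_b;

static void writer_update(u64 a, u64 b)
{
        write_seqlock(&state_lock);             /* lock + begin seqcount */
        state_a = a;
        state_b = b;
        write_sequnlock(&state_lock);           /* end seqcount + unlock */
}

static void reader_snapshot(u64 *a, u64 *b)
{
        unsigned seq;

        do {
                seq = read_seqbegin(&state_lock);       /* waits out writers */
                *a = state_a;
                *b = state_b;
        } while (read_seqretry(&state_lock, seq));      /* retry if raced */
}
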
index 9618b6e..fbc07a2 100644 (file)
@@ -2472,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
        if (!futex_cmpxchg_enabled)
                return -ENOSYS;
 
-       WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
        rcu_read_lock();
 
        ret = -ESRCH;
index 83e368b..a9642d5 100644 (file)
@@ -142,8 +142,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
        if (!futex_cmpxchg_enabled)
                return -ENOSYS;
 
-       WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
        rcu_read_lock();
 
        ret = -ESRCH;
index 7981e5b..8a0efac 100644 (file)
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 #endif
        if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
                debug_locks_off();
-               printk("BUG: MAX_LOCK_DEPTH too low!\n");
+               printk("BUG: MAX_LOCK_DEPTH too low, depth: %i  max: %lu!\n",
+                      curr->lockdep_depth, MAX_LOCK_DEPTH);
                printk("turning off the locking correctness validator.\n");
+
+               lockdep_print_held_locks(current);
+               debug_show_all_locks();
                dump_stack();
+
                return 0;
        }
 
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 
 static int
-print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
                           unsigned long ip)
 {
        if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
                return 0;
 
        if (curr->lockdep_depth <= 0)
-               return print_unlock_inbalance_bug(curr, lock, ip);
+               return print_unlock_imbalance_bug(curr, lock, ip);
 
        return 1;
 }
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
                        goto found_it;
                prev_hlock = hlock;
        }
-       return print_unlock_inbalance_bug(curr, lock, ip);
+       return print_unlock_imbalance_bug(curr, lock, ip);
 
 found_it:
        lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
                        goto found_it;
                prev_hlock = hlock;
        }
-       return print_unlock_inbalance_bug(curr, lock, ip);
+       return print_unlock_imbalance_bug(curr, lock, ip);
 
 found_it:
        if (hlock->instance == lock)
index b10a42b..072bb06 100644 (file)
@@ -23,7 +23,7 @@
  * NTP timekeeping variables:
  */
 
-DEFINE_SPINLOCK(ntp_lock);
+DEFINE_RAW_SPINLOCK(ntp_lock);
 
 
 /* USER_HZ period (usecs): */
@@ -348,7 +348,7 @@ void ntp_clear(void)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&ntp_lock, flags);
+       raw_spin_lock_irqsave(&ntp_lock, flags);
 
        time_adjust     = 0;            /* stop active adjtime() */
        time_status     |= STA_UNSYNC;
@@ -362,7 +362,7 @@ void ntp_clear(void)
 
        /* Clear PPS state variables */
        pps_clear();
-       spin_unlock_irqrestore(&ntp_lock, flags);
+       raw_spin_unlock_irqrestore(&ntp_lock, flags);
 
 }
 
@@ -372,9 +372,9 @@ u64 ntp_tick_length(void)
        unsigned long flags;
        s64 ret;
 
-       spin_lock_irqsave(&ntp_lock, flags);
+       raw_spin_lock_irqsave(&ntp_lock, flags);
        ret = tick_length;
-       spin_unlock_irqrestore(&ntp_lock, flags);
+       raw_spin_unlock_irqrestore(&ntp_lock, flags);
        return ret;
 }
 
@@ -395,7 +395,7 @@ int second_overflow(unsigned long secs)
        int leap = 0;
        unsigned long flags;
 
-       spin_lock_irqsave(&ntp_lock, flags);
+       raw_spin_lock_irqsave(&ntp_lock, flags);
 
        /*
         * Leap second processing. If in leap-insert state at the end of the
@@ -479,7 +479,7 @@ int second_overflow(unsigned long secs)
        time_adjust = 0;
 
 out:
-       spin_unlock_irqrestore(&ntp_lock, flags);
+       raw_spin_unlock_irqrestore(&ntp_lock, flags);
 
        return leap;
 }
@@ -672,7 +672,7 @@ int do_adjtimex(struct timex *txc)
 
        getnstimeofday(&ts);
 
-       spin_lock_irq(&ntp_lock);
+       raw_spin_lock_irq(&ntp_lock);
 
        if (txc->modes & ADJ_ADJTIME) {
                long save_adjust = time_adjust;
@@ -714,7 +714,7 @@ int do_adjtimex(struct timex *txc)
        /* fill PPS status fields */
        pps_fill_timex(txc);
 
-       spin_unlock_irq(&ntp_lock);
+       raw_spin_unlock_irq(&ntp_lock);
 
        txc->time.tv_sec = ts.tv_sec;
        txc->time.tv_usec = ts.tv_nsec;
@@ -912,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
 
        pts_norm = pps_normalize_ts(*phase_ts);
 
-       spin_lock_irqsave(&ntp_lock, flags);
+       raw_spin_lock_irqsave(&ntp_lock, flags);
 
        /* clear the error bits, they will be set again if needed */
        time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -925,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
         * just start the frequency interval */
        if (unlikely(pps_fbase.tv_sec == 0)) {
                pps_fbase = *raw_ts;
-               spin_unlock_irqrestore(&ntp_lock, flags);
+               raw_spin_unlock_irqrestore(&ntp_lock, flags);
                return;
        }
 
@@ -940,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
                time_status |= STA_PPSJITTER;
                /* restart the frequency calibration interval */
                pps_fbase = *raw_ts;
-               spin_unlock_irqrestore(&ntp_lock, flags);
+               raw_spin_unlock_irqrestore(&ntp_lock, flags);
                pr_err("hardpps: PPSJITTER: bad pulse\n");
                return;
        }
@@ -957,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
 
        hardpps_update_phase(pts_norm.nsec);
 
-       spin_unlock_irqrestore(&ntp_lock, flags);
+       raw_spin_unlock_irqrestore(&ntp_lock, flags);
 }
 EXPORT_SYMBOL(hardpps);
 
index 2768942..4a94467 100644 (file)
@@ -113,9 +113,9 @@ static int get_softlockup_thresh(void)
  * resolution, and we don't need to waste time with a big divide when
  * 2^30ns == 1.074s.
  */
-static unsigned long get_timestamp(int this_cpu)
+static unsigned long get_timestamp(void)
 {
-       return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+       return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
 static void set_sample_period(void)
@@ -133,9 +133,7 @@ static void set_sample_period(void)
 /* Commands for resetting the watchdog */
 static void __touch_watchdog(void)
 {
-       int this_cpu = smp_processor_id();
-
-       __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
+       __this_cpu_write(watchdog_touch_ts, get_timestamp());
 }
 
 void touch_softlockup_watchdog(void)
@@ -196,7 +194,7 @@ static int is_hardlockup(void)
 
 static int is_softlockup(unsigned long touch_ts)
 {
-       unsigned long now = get_timestamp(smp_processor_id());
+       unsigned long now = get_timestamp();
 
        /* Warn about unreasonable delays: */
        if (time_after(now, touch_ts + get_softlockup_thresh()))
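
local_clock() reads the current CPU's scheduler clock, so the explicit
CPU argument became redundant. The >> 30 converts nanoseconds to rough
seconds without a divide: since 2^30 ns = 1.073741824 s, the result reads
about 7% low, which is fine for a coarse watchdog timestamp. A quick
standalone check:

#include <stdio.h>

int main(void)
{
        unsigned long long ns = 10ULL * 1000000000ULL;  /* 10 s in ns */

        /* shifting divides by 2^30 instead of 10^9 */
        printf("10 s -> %llu 'seconds'\n", ns >> 30);   /* prints 9 */
        return 0;
}
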
index 7aae0f2..c3eb261 100644 (file)
@@ -47,10 +47,10 @@ __setup("debug_locks_verbose=", setup_debug_locks_verbose);
  * Normal standalone locks, for the circular and irq-context
  * dependency tests:
  */
-static DEFINE_SPINLOCK(lock_A);
-static DEFINE_SPINLOCK(lock_B);
-static DEFINE_SPINLOCK(lock_C);
-static DEFINE_SPINLOCK(lock_D);
+static DEFINE_RAW_SPINLOCK(lock_A);
+static DEFINE_RAW_SPINLOCK(lock_B);
+static DEFINE_RAW_SPINLOCK(lock_C);
+static DEFINE_RAW_SPINLOCK(lock_D);
 
 static DEFINE_RWLOCK(rwlock_A);
 static DEFINE_RWLOCK(rwlock_B);
@@ -73,12 +73,12 @@ static DECLARE_RWSEM(rwsem_D);
  * but X* and Y* are different classes. We do this so that
  * we do not trigger a real lockup:
  */
-static DEFINE_SPINLOCK(lock_X1);
-static DEFINE_SPINLOCK(lock_X2);
-static DEFINE_SPINLOCK(lock_Y1);
-static DEFINE_SPINLOCK(lock_Y2);
-static DEFINE_SPINLOCK(lock_Z1);
-static DEFINE_SPINLOCK(lock_Z2);
+static DEFINE_RAW_SPINLOCK(lock_X1);
+static DEFINE_RAW_SPINLOCK(lock_X2);
+static DEFINE_RAW_SPINLOCK(lock_Y1);
+static DEFINE_RAW_SPINLOCK(lock_Y2);
+static DEFINE_RAW_SPINLOCK(lock_Z1);
+static DEFINE_RAW_SPINLOCK(lock_Z2);
 
 static DEFINE_RWLOCK(rwlock_X1);
 static DEFINE_RWLOCK(rwlock_X2);
@@ -107,10 +107,10 @@ static DECLARE_RWSEM(rwsem_Z2);
  */
 #define INIT_CLASS_FUNC(class)                                 \
 static noinline void                                   \
-init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \
-                struct rw_semaphore *rwsem)            \
+init_class_##class(raw_spinlock_t *lock, rwlock_t *rwlock, \
+       struct mutex *mutex, struct rw_semaphore *rwsem)\
 {                                                      \
-       spin_lock_init(lock);                           \
+       raw_spin_lock_init(lock);                       \
        rwlock_init(rwlock);                            \
        mutex_init(mutex);                              \
        init_rwsem(rwsem);                              \
@@ -168,10 +168,10 @@ static void init_shared_classes(void)
  * Shortcuts for lock/unlock API variants, to keep
  * the testcases compact:
  */
-#define L(x)                   spin_lock(&lock_##x)
-#define U(x)                   spin_unlock(&lock_##x)
+#define L(x)                   raw_spin_lock(&lock_##x)
+#define U(x)                   raw_spin_unlock(&lock_##x)
 #define LU(x)                  L(x); U(x)
-#define SI(x)                  spin_lock_init(&lock_##x)
+#define SI(x)                  raw_spin_lock_init(&lock_##x)
 
 #define WL(x)                  write_lock(&rwlock_##x)
 #define WU(x)                  write_unlock(&rwlock_##x)
@@ -911,7 +911,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
 
 #define I2(x)                                  \
        do {                                    \
-               spin_lock_init(&lock_##x);      \
+               raw_spin_lock_init(&lock_##x);  \
                rwlock_init(&rwlock_##x);       \
                mutex_init(&mutex_##x);         \
                init_rwsem(&rwsem_##x);         \
index 7e0d6a5..7542afb 100644 (file)
@@ -73,20 +73,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
                goto dont_wake_writers;
        }
 
-       /* if we are allowed to wake writers try to grant a single write lock
-        * if there's a writer at the front of the queue
-        * - we leave the 'waiting count' incremented to signify potential
-        *   contention
+       /*
+        * As we support write lock stealing, we can't set sem->activity
+        * to -1 here to indicate that we took the lock. Instead, wake the
+        * writer up and let it try to take the lock again.
         */
        if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
-               sem->activity = -1;
-               list_del(&waiter->list);
-               tsk = waiter->task;
-               /* Don't touch waiter after ->task has been NULLed */
-               smp_mb();
-               waiter->task = NULL;
-               wake_up_process(tsk);
-               put_task_struct(tsk);
+               wake_up_process(waiter->task);
                goto out;
        }
 
@@ -121,18 +114,10 @@ static inline struct rw_semaphore *
 __rwsem_wake_one_writer(struct rw_semaphore *sem)
 {
        struct rwsem_waiter *waiter;
-       struct task_struct *tsk;
-
-       sem->activity = -1;
 
        waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-       list_del(&waiter->list);
+       wake_up_process(waiter->task);
 
-       tsk = waiter->task;
-       smp_mb();
-       waiter->task = NULL;
-       wake_up_process(tsk);
-       put_task_struct(tsk);
        return sem;
 }
 
@@ -204,7 +189,6 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 /*
  * get a write lock on the semaphore
- * - we increment the waiting count anyway to indicate an exclusive lock
  */
 void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
@@ -214,37 +198,32 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-       if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-               /* granted */
-               sem->activity = -1;
-               raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-               goto out;
-       }
-
-       tsk = current;
-       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
        /* set up my own style of waitqueue */
+       tsk = current;
        waiter.task = tsk;
        waiter.flags = RWSEM_WAITING_FOR_WRITE;
-       get_task_struct(tsk);
-
        list_add_tail(&waiter.list, &sem->wait_list);
 
-       /* we don't need to touch the semaphore struct anymore */
-       raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
-       /* wait to be given the lock */
+       /* wait for someone to release the lock */
        for (;;) {
-               if (!waiter.task)
+               /*
+                * This is the key to write lock stealing: it allows a task
+                * already on a CPU to take the lock soon, rather than putting
+                * itself to sleep and waiting until it, or the waiter at the
+                * head of the wait list, is woken up.
+                */
+               if (sem->activity == 0)
                        break;
-               schedule();
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+               raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+               schedule();
+               raw_spin_lock_irqsave(&sem->wait_lock, flags);
        }
+       /* got the lock */
+       sem->activity = -1;
+       list_del(&waiter.list);
 
-       tsk->state = TASK_RUNNING;
- out:
-       ;
+       raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 }
 
 void __sched __down_write(struct rw_semaphore *sem)
@@ -262,8 +241,8 @@ int __down_write_trylock(struct rw_semaphore *sem)
 
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-       if (sem->activity == 0 && list_empty(&sem->wait_list)) {
-               /* granted */
+       if (sem->activity == 0) {
+               /* got the lock */
                sem->activity = -1;
                ret = 1;
        }
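
With lock stealing, the waker no longer transfers ownership; it just wakes
the waiter, which must re-check the "lock is free" condition under
wait_lock, so a writer already running on a CPU can take the lock first. A
hedged userspace sketch of that wake-don't-handover shape, using pthreads
and hypothetical names:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wakeup    = PTHREAD_COND_INITIALIZER;
static bool lock_free = true;

static void writer_acquire(void)
{
        pthread_mutex_lock(&wait_lock);
        while (!lock_free)              /* re-check after every wakeup */
                pthread_cond_wait(&wakeup, &wait_lock);
        lock_free = false;              /* stolen or granted, either way */
        pthread_mutex_unlock(&wait_lock);
}

static void writer_release(void)
{
        pthread_mutex_lock(&wait_lock);
        lock_free = true;
        pthread_cond_signal(&wakeup);   /* wake a waiter, don't hand over */
        pthread_mutex_unlock(&wait_lock);
}
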
index 8337e1b..ad5e0df 100644 (file)
@@ -2,6 +2,8 @@
  *
  * Written by David Howells (dhowells@redhat.com).
  * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
@@ -60,7 +62,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
        struct rwsem_waiter *waiter;
        struct task_struct *tsk;
        struct list_head *next;
-       signed long oldcount, woken, loop, adjustment;
+       signed long woken, loop, adjustment;
 
        waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
        if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
@@ -72,30 +74,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
                 */
                goto out;
 
-       /* There's a writer at the front of the queue - try to grant it the
-        * write lock.  However, we only wake this writer if we can transition
-        * the active part of the count from 0 -> 1
-        */
-       adjustment = RWSEM_ACTIVE_WRITE_BIAS;
-       if (waiter->list.next == &sem->wait_list)
-               adjustment -= RWSEM_WAITING_BIAS;
-
- try_again_write:
-       oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
-       if (oldcount & RWSEM_ACTIVE_MASK)
-               /* Someone grabbed the sem already */
-               goto undo_write;
-
-       /* We must be careful not to touch 'waiter' after we set ->task = NULL.
-        * It is an allocated on the waiter's stack and may become invalid at
-        * any time after that point (due to a wakeup from another source).
-        */
-       list_del(&waiter->list);
-       tsk = waiter->task;
-       smp_mb();
-       waiter->task = NULL;
-       wake_up_process(tsk);
-       put_task_struct(tsk);
+       /* Wake up the writing waiter and let it grab the sem: */
+       wake_up_process(waiter->task);
        goto out;
 
  readers_only:
@@ -157,12 +137,40 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 
  out:
        return sem;
+}
+
+/* Try to get the write sem; the caller holds sem->wait_lock: */
+static int try_get_writer_sem(struct rw_semaphore *sem,
+                                       struct rwsem_waiter *waiter)
+{
+       struct rwsem_waiter *fwaiter;
+       long oldcount, adjustment;
 
-       /* undo the change to the active count, but check for a transition
-        * 1->0 */
- undo_write:
+       /* only steal when first waiter is writing */
+       fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+       if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
+               return 0;
+
+       adjustment = RWSEM_ACTIVE_WRITE_BIAS;
+       /* Only one waiter in the queue: */
+       if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
+               adjustment -= RWSEM_WAITING_BIAS;
+
+try_again_write:
+       oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+       if (!(oldcount & RWSEM_ACTIVE_MASK)) {
+               /* No active lock: */
+               struct task_struct *tsk = waiter->task;
+
+               list_del(&waiter->list);
+               smp_mb();
+               put_task_struct(tsk);
+               tsk->state = TASK_RUNNING;
+               return 1;
+       }
+       /* someone grabbed the sem already */
        if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
-               goto out;
+               return 0;
        goto try_again_write;
 }
 
@@ -210,6 +218,15 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
        for (;;) {
                if (!waiter.task)
                        break;
+
+               raw_spin_lock_irq(&sem->wait_lock);
+               /* Try to get the writer sem; we may steal it from the head writer: */
+               if (flags == RWSEM_WAITING_FOR_WRITE)
+                       if (try_get_writer_sem(sem, &waiter)) {
+                               raw_spin_unlock_irq(&sem->wait_lock);
+                               return sem;
+                       }
+               raw_spin_unlock_irq(&sem->wait_lock);
                schedule();
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
        }