Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Mar 2012 21:35:31 +0000 (14:35 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Mar 2012 21:35:31 +0000 (14:35 -0700)
Pull kvm updates from Avi Kivity:
 "Changes include timekeeping improvements, support for assigning host
  PCI devices that share interrupt lines, s390 user-controlled guests, a
  large ppc update, and random fixes."

This is with the sign-offs fixed; hopefully next merge window we won't
have rebased commits.

* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: Convert intx_mask_lock to spin lock
  KVM: x86: fix kvm_write_tsc() TSC matching thinko
  x86: kvmclock: abstract save/restore sched_clock_state
  KVM: nVMX: Fix erroneous exception bitmap check
  KVM: Ignore the writes to MSR_K7_HWCR(3)
  KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
  KVM: PMU: add proper support for fixed counter 2
  KVM: PMU: Fix raw event check
  KVM: PMU: warn when pin control is set in eventsel msr
  KVM: VMX: Fix delayed load of shared MSRs
  KVM: use correct tlbs dirty type in cmpxchg
  KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
  KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
  KVM: x86 emulator: Allow PM/VM86 switch during task switch
  KVM: SVM: Fix CPL updates
  KVM: x86 emulator: VM86 segments must have DPL 3
  KVM: x86 emulator: Fix task switch privilege checks
  arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
  KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
  KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
  ...
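
For the "support for assigning host PCI devices that share interrupt lines" item in
the pull request above, the userspace side goes through the legacy device-assignment
ioctls. The sketch below is illustrative only; the capability, ioctl and flag names
(KVM_CAP_PCI_2_3, KVM_DEV_ASSIGN_PCI_2_3, KVM_ASSIGN_SET_INTX_MASK,
KVM_DEV_ASSIGN_MASK_INTX) are my recollection of the 3.4-era API and should be
checked against Documentation/virtual/kvm/api.txt.

/*
 * Illustrative only -- not part of this merge.  Enable INTx sharing for an
 * assigned device when the host advertises KVM_CAP_PCI_2_3.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int assign_dev_with_intx_sharing(int kvm_fd, int vm_fd, __u32 id,
					__u32 busnr, __u32 devfn)
{
	struct kvm_assigned_pci_dev dev = {
		.assigned_dev_id = id,
		.busnr  = busnr,
		.devfn  = devfn,
		.flags  = KVM_DEV_ASSIGN_ENABLE_IOMMU,
	};

	/* Ask for PCI 2.3 INTx masking only if the host supports it. */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PCI_2_3) > 0)
		dev.flags |= KVM_DEV_ASSIGN_PCI_2_3;

	if (ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev) < 0)
		return -1;

	/* Mirror the guest's INTx mask state into the host. */
	dev.flags |= KVM_DEV_ASSIGN_MASK_INTX;
	return ioctl(vm_fd, KVM_ASSIGN_SET_INTX_MASK, &dev);
}

The point of the PCI 2.3 support is that the host can mask the device's INTx via the
config-space Interrupt Disable bit instead of leaving the line blocked, which is what
allows the interrupt line to be shared with other host drivers.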

16 files changed:
arch/powerpc/include/asm/reg.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/mm/hugetlbpage.c
arch/s390/kvm/interrupt.c
arch/x86/include/asm/perf_event.h
arch/x86/kernel/smpboot.c
arch/x86/kernel/tsc.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/power/cpu.c
include/linux/kvm_host.h

diff --combined arch/powerpc/include/asm/reg.h
index b1a215eabef6abc872ae94107a1be3af3c2bdad6,35c9309bf038cfc2f843d999bec663a5b6922ba0..9d7f0fb690285bd0bb183e375080518e695792cb
  #define   DSISR_ISSTORE               0x02000000      /* access was a store */
  #define   DSISR_DABRMATCH     0x00400000      /* hit data breakpoint */
  #define   DSISR_NOSEGMENT     0x00200000      /* STAB/SLB miss */
+ #define   DSISR_KEYFAULT      0x00200000      /* Key fault */
  #define SPRN_TBRL     0x10C   /* Time Base Read Lower Register (user, R/O) */
  #define SPRN_TBRU     0x10D   /* Time Base Read Upper Register (user, R/O) */
  #define SPRN_TBWL     0x11C   /* Time Base Lower Register (super, R/W) */
  #define   LPCR_ISL    (1ul << (63-2))
  #define   LPCR_VC_SH  (63-2)
  #define   LPCR_DPFD_SH        (63-11)
+ #define   LPCR_VRMASD (0x1ful << (63-16))
  #define   LPCR_VRMA_L (1ul << (63-12))
  #define   LPCR_VRMA_LP0       (1ul << (63-15))
  #define   LPCR_VRMA_LP1       (1ul << (63-16))
  #define SPRN_SPRG7    0x117   /* Special Purpose Register General 7 */
  #define SPRN_SRR0     0x01A   /* Save/Restore Register 0 */
  #define SPRN_SRR1     0x01B   /* Save/Restore Register 1 */
+ #define   SRR1_ISI_NOPT               0x40000000 /* ISI: Not found in hash */
+ #define   SRR1_ISI_N_OR_G     0x10000000 /* ISI: Access is no-exec or G */
+ #define   SRR1_ISI_PROT               0x08000000 /* ISI: Other protection fault */
  #define   SRR1_WAKEMASK               0x00380000 /* reason for wakeup */
  #define   SRR1_WAKESYSERR     0x00300000 /* System error */
  #define   SRR1_WAKEEE         0x00200000 /* External interrupt */
  
  #define proc_trap()   asm volatile("trap")
  
 -#ifdef CONFIG_PPC64
 -
 -extern void ppc64_runlatch_on(void);
 -extern void __ppc64_runlatch_off(void);
 -
 -#define ppc64_runlatch_off()                                  \
 -      do {                                                    \
 -              if (cpu_has_feature(CPU_FTR_CTRL) &&            \
 -                  test_thread_flag(TIF_RUNLATCH))             \
 -                      __ppc64_runlatch_off();                 \
 -      } while (0)
 +#define __get_SP()    ({unsigned long sp; \
 +                      asm volatile("mr %0,1": "=r" (sp)); sp;})
  
  extern unsigned long scom970_read(unsigned int address);
  extern void scom970_write(unsigned int address, unsigned long value);
  
 -#else
 -#define ppc64_runlatch_on()
 -#define ppc64_runlatch_off()
 -
 -#endif /* CONFIG_PPC64 */
 -
 -#define __get_SP()    ({unsigned long sp; \
 -                      asm volatile("mr %0,1": "=r" (sp)); sp;})
 -
  struct pt_regs;
  
  extern void ppc_save_regs(struct pt_regs *regs);
diff --combined arch/powerpc/kernel/asm-offsets.c
index cc492e48ddfac1bf1db71b16b739799a7f929111,8e0db0b12dd05e6701bc2efec08c23d9fefc5021..34b8afe94a500f1b7191bce06ad23b2b32427725
@@@ -46,6 -46,9 +46,6 @@@
  #include <asm/hvcall.h>
  #include <asm/xics.h>
  #endif
 -#ifdef CONFIG_PPC_ISERIES
 -#include <asm/iseries/alpaca.h>
 -#endif
  #ifdef CONFIG_PPC_POWERNV
  #include <asm/opal.h>
  #endif
@@@ -144,7 -147,7 +144,7 @@@ int main(void
        DEFINE(PACAKBASE, offsetof(struct paca_struct, kernelbase));
        DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
        DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 -      DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
 +      DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened));
        DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
  #ifdef CONFIG_PPC_MM_SLICES
        DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
        DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
  #endif
  
 -#ifdef CONFIG_PPC_ISERIES
 -      /* the assembler miscalculates the VSID values */
 -      DEFINE(PAGE_OFFSET_ESID, GET_ESID(PAGE_OFFSET));
 -      DEFINE(PAGE_OFFSET_VSID, KERNEL_VSID(PAGE_OFFSET));
 -      DEFINE(VMALLOC_START_ESID, GET_ESID(VMALLOC_START));
 -      DEFINE(VMALLOC_START_VSID, KERNEL_VSID(VMALLOC_START));
 -
 -      /* alpaca */
 -      DEFINE(ALPACA_SIZE, sizeof(struct alpaca));
 -#endif
 -
        DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
        DEFINE(PTE_SIZE, sizeof(pte_t));
  
        DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
        DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
  #endif
-       DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
-       DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
-       DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
-       DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
+       DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
+       DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
+       DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
+       DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7));
        DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
        DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
        DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
        DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
        DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
  
+       DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
+       DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
+       DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2));
+       DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3));
+       DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
+       DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
        /* book3s */
  #ifdef CONFIG_KVM_BOOK3S_64_HV
        DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
        DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
        DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
        DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+       DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
        DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
        DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
  #endif
diff --combined arch/powerpc/kernel/exceptions-64s.S
index 2d0868a4e2f0d3d4b346721c1a25fad8ef717316,8bea12086b6763f27a7c447e3e08513416e739ad..cb705fdbb4583b6b162c9c2cb8991be43525eea4
@@@ -12,7 -12,6 +12,7 @@@
   *
   */
  
 +#include <asm/hw_irq.h>
  #include <asm/exception-64s.h>
  #include <asm/ptrace.h>
  
@@@ -20,7 -19,7 +20,7 @@@
   * We layout physical memory as follows:
   * 0x0000 - 0x00ff : Secondary processor spin code
   * 0x0100 - 0x2fff : pSeries Interrupt prologs
 - * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs
 + * 0x3000 - 0x5fff : interrupt support common interrupt prologs
   * 0x6000 - 0x6fff : Initial (CPU0) segment table
   * 0x7000 - 0x7fff : FWNMI data area
   * 0x8000 -        : Early init and support code
@@@ -101,14 -100,14 +101,14 @@@ data_access_not_stab
  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
  #endif
        EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-                                KVMTEST_PR, 0x300)
+                                KVMTEST, 0x300)
  
        . = 0x380
        .globl data_access_slb_pSeries
  data_access_slb_pSeries:
        HMT_MEDIUM
        SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
+       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
        std     r3,PACA_EXSLB+EX_R3(r13)
        mfspr   r3,SPRN_DAR
  #ifdef __DISABLED__
@@@ -330,8 -329,8 +330,8 @@@ do_stab_bolted_pSeries
        EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
  #endif /* CONFIG_POWER4_ONLY */
  
-       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-       KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+       KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+       KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
        KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
        KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
  
  /*
 - * An interrupt came in while soft-disabled; clear EE in SRR1,
 - * clear paca->hard_enabled and return.
 + * An interrupt came in while soft-disabled. We set paca->irq_happened,
 + * then, if it was a decrementer interrupt, we bump the dec to max
 + * and return, else we hard disable and return. This is called with
 + * r10 containing the value to OR to the paca field.
   */
 -masked_interrupt:
 -      stb     r10,PACAHARDIRQEN(r13)
 -      mtcrf   0x80,r9
 -      ld      r9,PACA_EXGEN+EX_R9(r13)
 -      mfspr   r10,SPRN_SRR1
 -      rldicl  r10,r10,48,1            /* clear MSR_EE */
 -      rotldi  r10,r10,16
 -      mtspr   SPRN_SRR1,r10
 -      ld      r10,PACA_EXGEN+EX_R10(r13)
 -      GET_SCRATCH0(r13)
 -      rfid
 +#define MASKED_INTERRUPT(_H)                          \
 +masked_##_H##interrupt:                                       \
 +      std     r11,PACA_EXGEN+EX_R11(r13);             \
 +      lbz     r11,PACAIRQHAPPENED(r13);               \
 +      or      r11,r11,r10;                            \
 +      stb     r11,PACAIRQHAPPENED(r13);               \
 +      andi.   r10,r10,PACA_IRQ_DEC;                   \
 +      beq     1f;                                     \
 +      lis     r10,0x7fff;                             \
 +      ori     r10,r10,0xffff;                         \
 +      mtspr   SPRN_DEC,r10;                           \
 +      b       2f;                                     \
 +1:    mfspr   r10,SPRN_##_H##SRR1;                    \
 +      rldicl  r10,r10,48,1; /* clear MSR_EE */        \
 +      rotldi  r10,r10,16;                             \
 +      mtspr   SPRN_##_H##SRR1,r10;                    \
 +2:    mtcrf   0x80,r9;                                \
 +      ld      r9,PACA_EXGEN+EX_R9(r13);               \
 +      ld      r10,PACA_EXGEN+EX_R10(r13);             \
 +      ld      r11,PACA_EXGEN+EX_R11(r13);             \
 +      GET_SCRATCH0(r13);                              \
 +      ##_H##rfid;                                     \
        b       .
 +      
 +      MASKED_INTERRUPT()
 +      MASKED_INTERRUPT(H)
  
 -masked_Hinterrupt:
 -      stb     r10,PACAHARDIRQEN(r13)
 -      mtcrf   0x80,r9
 -      ld      r9,PACA_EXGEN+EX_R9(r13)
 -      mfspr   r10,SPRN_HSRR1
 -      rldicl  r10,r10,48,1            /* clear MSR_EE */
 -      rotldi  r10,r10,16
 -      mtspr   SPRN_HSRR1,r10
 -      ld      r10,PACA_EXGEN+EX_R10(r13)
 -      GET_SCRATCH0(r13)
 -      hrfid
 -      b       .
 +/*
 + * Called from arch_local_irq_enable when an interrupt needs
 + * to be resent. r3 contains 0x500 or 0x900 to indicate which
 + * kind of interrupt. MSR:EE is already off. We generate a
 + * stackframe as if a real interrupt had happened.
 + *
 + * Note: While MSR:EE is off, we need to make sure that _MSR
 + * in the generated frame has EE set to 1 or the exception
 + * handler will not properly re-enable them.
 + */
 +_GLOBAL(__replay_interrupt)
 +      /* We are going to jump to the exception common code which
 +       * will retrieve various register values from the PACA which
 +       * we don't give a damn about, so we don't bother storing them.
 +       */
 +      mfmsr   r12
 +      mflr    r11
 +      mfcr    r9
 +      ori     r12,r12,MSR_EE
 +      andi.   r3,r3,0x0800
 +      bne     decrementer_common
 +      b       hardware_interrupt_common
  
  #ifdef CONFIG_PPC_PSERIES
  /*
@@@ -485,15 -458,14 +485,15 @@@ machine_check_common
        bl      .machine_check_exception
        b       .ret_from_except
  
 -      STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt)
 +      STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ)
 +      STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt)
        STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception)
        STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception)
        STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception)
        STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception)
          STD_EXCEPTION_COMMON(0xe40, emulation_assist, .program_check_exception)
          STD_EXCEPTION_COMMON(0xe60, hmi_exception, .unknown_exception)
 -      STD_EXCEPTION_COMMON_IDLE(0xf00, performance_monitor, .performance_monitor_exception)
 +      STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, .performance_monitor_exception)
        STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception)
  #ifdef CONFIG_ALTIVEC
        STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception)
  system_call_entry:
        b       system_call_common
  
 +ppc64_runlatch_on_trampoline:
 +      b       .__ppc64_runlatch_on
 +
  /*
   * Here we have detected that the kernel stack pointer is bad.
   * R9 contains the saved CR, r13 points to the paca,
@@@ -586,8 -555,6 +586,8 @@@ data_access_common
        mfspr   r10,SPRN_DSISR
        stw     r10,PACA_EXGEN+EX_DSISR(r13)
        EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
 +      DISABLE_INTS
 +      ld      r12,_MSR(r1)
        ld      r3,PACA_EXGEN+EX_DAR(r13)
        lwz     r4,PACA_EXGEN+EX_DSISR(r13)
        li      r5,0x300
@@@ -602,7 -569,6 +602,7 @@@ h_data_storage_common
          stw     r10,PACA_EXGEN+EX_DSISR(r13)
          EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
          bl      .save_nvgprs
 +      DISABLE_INTS
          addi    r3,r1,STACK_FRAME_OVERHEAD
          bl      .unknown_exception
          b       .ret_from_except
        .globl instruction_access_common
  instruction_access_common:
        EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
 +      DISABLE_INTS
 +      ld      r12,_MSR(r1)
        ld      r3,_NIP(r1)
        andis.  r4,r12,0x5820
        li      r5,0x400
@@@ -708,6 -672,12 +708,6 @@@ _GLOBAL(slb_miss_realmode
        ld      r10,PACA_EXSLB+EX_LR(r13)
        ld      r3,PACA_EXSLB+EX_R3(r13)
        lwz     r9,PACA_EXSLB+EX_CCR(r13)       /* get saved CR */
 -#ifdef CONFIG_PPC_ISERIES
 -BEGIN_FW_FTR_SECTION
 -      ld      r11,PACALPPACAPTR(r13)
 -      ld      r11,LPPACASRR0(r11)             /* get SRR0 value */
 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
 -#endif /* CONFIG_PPC_ISERIES */
  
        mtlr    r10
  
        mtcrf   0x01,r9         /* slb_allocate uses cr0 and cr7 */
  .machine      pop
  
 -#ifdef CONFIG_PPC_ISERIES
 -BEGIN_FW_FTR_SECTION
 -      mtspr   SPRN_SRR0,r11
 -      mtspr   SPRN_SRR1,r12
 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
 -#endif /* CONFIG_PPC_ISERIES */
        ld      r9,PACA_EXSLB+EX_R9(r13)
        ld      r10,PACA_EXSLB+EX_R10(r13)
        ld      r11,PACA_EXSLB+EX_R11(r13)
        rfid
        b       .       /* prevent speculative execution */
  
 -2:
 -#ifdef CONFIG_PPC_ISERIES
 -BEGIN_FW_FTR_SECTION
 -      b       unrecov_slb
 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
 -#endif /* CONFIG_PPC_ISERIES */
 -      mfspr   r11,SPRN_SRR0
 +2:    mfspr   r11,SPRN_SRR0
        ld      r10,PACAKBASE(r13)
        LOAD_HANDLER(r10,unrecov_slb)
        mtspr   SPRN_SRR0,r10
@@@ -745,6 -727,20 +745,6 @@@ unrecov_slb
        bl      .unrecoverable_exception
        b       1b
  
 -      .align  7
 -      .globl hardware_interrupt_common
 -      .globl hardware_interrupt_entry
 -hardware_interrupt_common:
 -      EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN)
 -      FINISH_NAP
 -hardware_interrupt_entry:
 -      DISABLE_INTS
 -BEGIN_FTR_SECTION
 -      bl      .ppc64_runlatch_on
 -END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 -      addi    r3,r1,STACK_FRAME_OVERHEAD
 -      bl      .do_IRQ
 -      b       .ret_from_except_lite
  
  #ifdef CONFIG_PPC_970_NAP
  power4_fixup_nap:
@@@ -789,8 -785,8 +789,8 @@@ fp_unavailable_common
        EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
        bne     1f                      /* if from user, just load it up */
        bl      .save_nvgprs
 +      DISABLE_INTS
        addi    r3,r1,STACK_FRAME_OVERHEAD
 -      ENABLE_INTS
        bl      .kernel_fp_unavailable_exception
        BUG_OPCODE
  1:    bl      .load_up_fpu
@@@ -809,8 -805,8 +809,8 @@@ BEGIN_FTR_SECTIO
  END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  #endif
        bl      .save_nvgprs
 +      DISABLE_INTS
        addi    r3,r1,STACK_FRAME_OVERHEAD
 -      ENABLE_INTS
        bl      .altivec_unavailable_exception
        b       .ret_from_except
  
@@@ -820,14 -816,13 +820,14 @@@ vsx_unavailable_common
        EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
  #ifdef CONFIG_VSX
  BEGIN_FTR_SECTION
 -      bne     .load_up_vsx
 +      beq     1f
 +      b       .load_up_vsx
  1:
  END_FTR_SECTION_IFSET(CPU_FTR_VSX)
  #endif
        bl      .save_nvgprs
 +      DISABLE_INTS
        addi    r3,r1,STACK_FRAME_OVERHEAD
 -      ENABLE_INTS
        bl      .vsx_unavailable_exception
        b       .ret_from_except
  
        .globl  __end_handlers
  __end_handlers:
  
 -/*
 - * Return from an exception with minimal checks.
 - * The caller is assumed to have done EXCEPTION_PROLOG_COMMON.
 - * If interrupts have been enabled, or anything has been
 - * done that might have changed the scheduling status of
 - * any task or sent any task a signal, you should use
 - * ret_from_except or ret_from_except_lite instead of this.
 - */
 -fast_exc_return_irq:                  /* restores irq state too */
 -      ld      r3,SOFTE(r1)
 -      TRACE_AND_RESTORE_IRQ(r3);
 -      ld      r12,_MSR(r1)
 -      rldicl  r4,r12,49,63            /* get MSR_EE to LSB */
 -      stb     r4,PACAHARDIRQEN(r13)   /* restore paca->hard_enabled */
 -      b       1f
 -
 -      .globl  fast_exception_return
 -fast_exception_return:
 -      ld      r12,_MSR(r1)
 -1:    ld      r11,_NIP(r1)
 -      andi.   r3,r12,MSR_RI           /* check if RI is set */
 -      beq-    unrecov_fer
 -
 -#ifdef CONFIG_VIRT_CPU_ACCOUNTING
 -      andi.   r3,r12,MSR_PR
 -      beq     2f
 -      ACCOUNT_CPU_USER_EXIT(r3, r4)
 -2:
 -#endif
 -
 -      ld      r3,_CCR(r1)
 -      ld      r4,_LINK(r1)
 -      ld      r5,_CTR(r1)
 -      ld      r6,_XER(r1)
 -      mtcr    r3
 -      mtlr    r4
 -      mtctr   r5
 -      mtxer   r6
 -      REST_GPR(0, r1)
 -      REST_8GPRS(2, r1)
 -
 -      mfmsr   r10
 -      rldicl  r10,r10,48,1            /* clear EE */
 -      rldicr  r10,r10,16,61           /* clear RI (LE is 0 already) */
 -      mtmsrd  r10,1
 -
 -      mtspr   SPRN_SRR1,r12
 -      mtspr   SPRN_SRR0,r11
 -      REST_4GPRS(10, r1)
 -      ld      r1,GPR1(r1)
 -      rfid
 -      b       .       /* prevent speculative execution */
 -
 -unrecov_fer:
 -      bl      .save_nvgprs
 -1:    addi    r3,r1,STACK_FRAME_OVERHEAD
 -      bl      .unrecoverable_exception
 -      b       1b
 -
 -
  /*
   * Hash table stuff
   */
@@@ -857,6 -912,28 +857,6 @@@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB
        lwz     r0,TI_PREEMPT(r11)      /* If we're in an "NMI" */
        andis.  r0,r0,NMI_MASK@h        /* (i.e. an irq when soft-disabled) */
        bne     77f                     /* then don't call hash_page now */
 -
 -      /*
 -       * On iSeries, we soft-disable interrupts here, then
 -       * hard-enable interrupts so that the hash_page code can spin on
 -       * the hash_table_lock without problems on a shared processor.
 -       */
 -      DISABLE_INTS
 -
 -      /*
 -       * Currently, trace_hardirqs_off() will be called by DISABLE_INTS
 -       * and will clobber volatile registers when irq tracing is enabled
 -       * so we need to reload them. It may be possible to be smarter here
 -       * and move the irq tracing elsewhere but let's keep it simple for
 -       * now
 -       */
 -#ifdef CONFIG_TRACE_IRQFLAGS
 -      ld      r3,_DAR(r1)
 -      ld      r4,_DSISR(r1)
 -      ld      r5,_TRAP(r1)
 -      ld      r12,_MSR(r1)
 -      clrrdi  r5,r5,4
 -#endif /* CONFIG_TRACE_IRQFLAGS */
        /*
         * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
         * accessing a userspace segment (even from the kernel). We assume
         * r4 contains the required access permissions
         * r5 contains the trap number
         *
 -       * at return r3 = 0 for success
 +       * at return r3 = 0 for success, 1 for page fault, negative for error
         */
        bl      .hash_page              /* build HPTE if possible */
        cmpdi   r3,0                    /* see if hash_page succeeded */
  
 -BEGIN_FW_FTR_SECTION
 -      /*
 -       * If we had interrupts soft-enabled at the point where the
 -       * DSI/ISI occurred, and an interrupt came in during hash_page,
 -       * handle it now.
 -       * We jump to ret_from_except_lite rather than fast_exception_return
 -       * because ret_from_except_lite will check for and handle pending
 -       * interrupts if necessary.
 -       */
 -      beq     13f
 -END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
 -
 -BEGIN_FW_FTR_SECTION
 -      /*
 -       * Here we have interrupts hard-disabled, so it is sufficient
 -       * to restore paca->{soft,hard}_enable and get out.
 -       */
 +      /* Success */
        beq     fast_exc_return_irq     /* Return from exception on success */
 -END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
 -
 -      /* For a hash failure, we don't bother re-enabling interrupts */
 -      ble-    12f
 -
 -      /*
 -       * hash_page couldn't handle it, set soft interrupt enable back
 -       * to what it was before the trap.  Note that .arch_local_irq_restore
 -       * handles any interrupts pending at this point.
 -       */
 -      ld      r3,SOFTE(r1)
 -      TRACE_AND_RESTORE_IRQ_PARTIAL(r3, 11f)
 -      bl      .arch_local_irq_restore
 -      b       11f
  
 -/* We have a data breakpoint exception - handle it */
 -handle_dabr_fault:
 -      bl      .save_nvgprs
 -      ld      r4,_DAR(r1)
 -      ld      r5,_DSISR(r1)
 -      addi    r3,r1,STACK_FRAME_OVERHEAD
 -      bl      .do_dabr
 -      b       .ret_from_except_lite
 +      /* Error */
 +      blt-    13f
  
  /* Here we have a page fault that hash_page can't handle. */
  handle_page_fault:
 -      ENABLE_INTS
  11:   ld      r4,_DAR(r1)
        ld      r5,_DSISR(r1)
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      .do_page_fault
        cmpdi   r3,0
 -      beq+    13f
 +      beq+    12f
        bl      .save_nvgprs
        mr      r5,r3
        addi    r3,r1,STACK_FRAME_OVERHEAD
        bl      .bad_page_fault
        b       .ret_from_except
  
 -13:   b       .ret_from_except_lite
 +/* We have a data breakpoint exception - handle it */
 +handle_dabr_fault:
 +      bl      .save_nvgprs
 +      ld      r4,_DAR(r1)
 +      ld      r5,_DSISR(r1)
 +      addi    r3,r1,STACK_FRAME_OVERHEAD
 +      bl      .do_dabr
 +12:   b       .ret_from_except_lite
 +
  
  /* We have a page fault that hash_page could handle but HV refused
   * the PTE insertion
   */
 -12:   bl      .save_nvgprs
 +13:   bl      .save_nvgprs
        mr      r5,r3
        addi    r3,r1,STACK_FRAME_OVERHEAD
        ld      r4,_DAR(r1)
@@@ -1035,19 -1141,51 +1035,19 @@@ _GLOBAL(do_stab_bolted
        . = 0x7000
        .globl fwnmi_data_area
  fwnmi_data_area:
 -#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
  
 -      /* iSeries does not use the FWNMI stuff, so it is safe to put
 -       * this here, even if we later allow kernels that will boot on
 -       * both pSeries and iSeries */
 -#ifdef CONFIG_PPC_ISERIES
 -        . = LPARMAP_PHYS
 -      .globl xLparMap
 -xLparMap:
 -      .quad   HvEsidsToMap            /* xNumberEsids */
 -      .quad   HvRangesToMap           /* xNumberRanges */
 -      .quad   STAB0_PAGE              /* xSegmentTableOffs */
 -      .zero   40                      /* xRsvd */
 -      /* xEsids (HvEsidsToMap entries of 2 quads) */
 -      .quad   PAGE_OFFSET_ESID        /* xKernelEsid */
 -      .quad   PAGE_OFFSET_VSID        /* xKernelVsid */
 -      .quad   VMALLOC_START_ESID      /* xKernelEsid */
 -      .quad   VMALLOC_START_VSID      /* xKernelVsid */
 -      /* xRanges (HvRangesToMap entries of 3 quads) */
 -      .quad   HvPagesToMap            /* xPages */
 -      .quad   0                       /* xOffset */
 -      .quad   PAGE_OFFSET_VSID << (SID_SHIFT - HW_PAGE_SHIFT) /* xVPN */
 -
 -#endif /* CONFIG_PPC_ISERIES */
 -
 -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
        /* pseries and powernv need to keep the whole page from
         * 0x7000 to 0x8000 free for use by the firmware
         */
          . = 0x8000
  #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
  
 -/*
 - * Space for CPU0's segment table.
 - *
 - * On iSeries, the hypervisor must fill in at least one entry before
 - * we get control (with relocate on).  The address is given to the hv
 - * as a page number (see xLparMap above), so this must be at a
 - * fixed address (the linker can't compute (u64)&initial_stab >>
 - * PAGE_SHIFT).
 - */
 -      . = STAB0_OFFSET        /* 0x8000 */
 +/* Space for CPU0's segment table */
 +      .balign 4096
        .globl initial_stab
  initial_stab:
        .space  4096
 +
  #ifdef CONFIG_PPC_POWERNV
  _GLOBAL(opal_mc_secondary_handler)
        HMT_MEDIUM
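
The MASKED_INTERRUPT macro and the __replay_interrupt comment in the hunks above
describe the new lazy interrupt-disabling scheme. The following is not kernel code,
just a C-level paraphrase of the flow those comments spell out; the helper names and
PACA_IRQ_* values are illustrative only.

/*
 * NOT kernel code: a paraphrase of the flow described in the comments above.
 * Helper names and PACA_IRQ_* values are illustrative.
 */
#define PACA_IRQ_EE	0x04		/* external interrupt latched */
#define PACA_IRQ_DEC	0x08		/* decrementer latched */

static unsigned int irq_happened;	/* stands in for paca->irq_happened */

void bump_decrementer_to_max(void);	/* illustrative helpers */
void return_hard_disabled(void);
void __replay_interrupt(unsigned int vector);	/* the entry point added above */

/* masked_interrupt/masked_Hinterrupt: an interrupt fired while we were only
 * soft-disabled, so just remember it and keep going. */
static void masked_interrupt_sketch(unsigned int reason)
{
	irq_happened |= reason;
	if (reason & PACA_IRQ_DEC)
		bump_decrementer_to_max();	/* push the timer far out */
	else
		return_hard_disabled();		/* clear EE in (H)SRR1 */
}

/* When interrupts are soft-enabled again, the latched interrupt is replayed
 * through the regular common handlers. */
static void replay_pending_sketch(void)
{
	unsigned int happened = irq_happened;

	irq_happened = 0;
	if (happened & PACA_IRQ_DEC)
		__replay_interrupt(0x900);	/* decrementer_common */
	else if (happened & PACA_IRQ_EE)
		__replay_interrupt(0x500);	/* hardware_interrupt_common */
}
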
diff --combined arch/powerpc/kvm/book3s_pr.c
index 220fcdf26978efbf3bfd485772cfd7fb2453519b,ee222ec7c95ce268c960e190be6d7a5b81c5e2d6..7340e1090b770302cdb055728b0c7740241c0ca6
@@@ -51,15 -51,19 +51,19 @@@ static int kvmppc_handle_ext(struct kvm
  #define MSR_USER32 MSR_USER
  #define MSR_USER64 MSR_USER
  #define HW_PAGE_SIZE PAGE_SIZE
+ #define __hard_irq_disable local_irq_disable
+ #define __hard_irq_enable local_irq_enable
  #endif
  
  void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  {
  #ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+       memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
        memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
               sizeof(get_paca()->shadow_vcpu));
-       to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+       svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
+       svcpu_put(svcpu);
  #endif
  
  #ifdef CONFIG_PPC_BOOK3S_32
  void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
  {
  #ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+       struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+       memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
        memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
               sizeof(get_paca()->shadow_vcpu));
-       to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+       to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
+       svcpu_put(svcpu);
  #endif
  
        kvmppc_giveup_ext(vcpu, MSR_FP);
@@@ -151,14 -157,16 +157,16 @@@ void kvmppc_set_pvr(struct kvm_vcpu *vc
  #ifdef CONFIG_PPC_BOOK3S_64
        if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
                kvmppc_mmu_book3s_64_init(vcpu);
-               to_book3s(vcpu)->hior = 0xfff00000;
+               if (!to_book3s(vcpu)->hior_explicit)
+                       to_book3s(vcpu)->hior = 0xfff00000;
                to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
                vcpu->arch.cpu_type = KVM_CPU_3S_64;
        } else
  #endif
        {
                kvmppc_mmu_book3s_32_init(vcpu);
-               to_book3s(vcpu)->hior = 0;
+               if (!to_book3s(vcpu)->hior_explicit)
+                       to_book3s(vcpu)->hior = 0;
                to_book3s(vcpu)->msr_mask = 0xffffffffULL;
                vcpu->arch.cpu_type = KVM_CPU_3S_32;
        }
@@@ -227,14 -235,14 +235,14 @@@ static void kvmppc_patch_dcbz(struct kv
        hpage_offset /= 4;
  
        get_page(hpage);
 -      page = kmap_atomic(hpage, KM_USER0);
 +      page = kmap_atomic(hpage);
  
        /* patch dcbz into reserved instruction, so we trap */
        for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
                if ((page[i] & 0xff0007ff) == INS_DCBZ)
                        page[i] &= 0xfffffff7;
  
 -      kunmap_atomic(page, KM_USER0);
 +      kunmap_atomic(page);
        put_page(hpage);
  }
  
@@@ -308,19 -316,22 +316,22 @@@ int kvmppc_handle_pagefault(struct kvm_
  
        if (page_found == -ENOENT) {
                /* Page not found in guest PTE entries */
+               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
                vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+               vcpu->arch.shared->dsisr = svcpu->fault_dsisr;
                vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+                       (svcpu->shadow_srr1 & 0x00000000f8000000ULL);
+               svcpu_put(svcpu);
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EPERM) {
                /* Storage protection */
+               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
                vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr =
-                       to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+               vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE;
                vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
                vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+                       svcpu->shadow_srr1 & 0x00000000f8000000ULL;
+               svcpu_put(svcpu);
                kvmppc_book3s_queue_irqprio(vcpu, vec);
        } else if (page_found == -EINVAL) {
                /* Page not found in guest SLB */
@@@ -517,24 -528,29 +528,29 @@@ int kvmppc_handle_exit(struct kvm_run *
        run->ready_for_interrupt_injection = 1;
  
        trace_kvm_book3s_exit(exit_nr, vcpu);
+       preempt_enable();
        kvm_resched(vcpu);
        switch (exit_nr) {
        case BOOK3S_INTERRUPT_INST_STORAGE:
+       {
+               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+               ulong shadow_srr1 = svcpu->shadow_srr1;
                vcpu->stat.pf_instruc++;
  
  #ifdef CONFIG_PPC_BOOK3S_32
                /* We set segments as unused segments when invalidating them. So
                 * treat the respective fault as segment fault. */
-               if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-                   == SR_INVALID) {
+               if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) {
                        kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
                        r = RESUME_GUEST;
+                       svcpu_put(svcpu);
                        break;
                }
  #endif
+               svcpu_put(svcpu);
  
                /* only care about PTEG not found errors, but leave NX alone */
-               if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+               if (shadow_srr1 & 0x40000000) {
                        r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
                        vcpu->stat.sp_instruc++;
                } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
                        kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
                        r = RESUME_GUEST;
                } else {
-                       vcpu->arch.shared->msr |=
-                               to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+                       vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000;
                        kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                        r = RESUME_GUEST;
                }
                break;
+       }
        case BOOK3S_INTERRUPT_DATA_STORAGE:
        {
                ulong dar = kvmppc_get_fault_dar(vcpu);
+               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+               u32 fault_dsisr = svcpu->fault_dsisr;
                vcpu->stat.pf_storage++;
  
  #ifdef CONFIG_PPC_BOOK3S_32
                /* We set segments as unused segments when invalidating them. So
                 * treat the respective fault as segment fault. */
-               if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+               if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) {
                        kvmppc_mmu_map_segment(vcpu, dar);
                        r = RESUME_GUEST;
+                       svcpu_put(svcpu);
                        break;
                }
  #endif
+               svcpu_put(svcpu);
  
                /* The only case we need to handle is missing shadow PTEs */
-               if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+               if (fault_dsisr & DSISR_NOHPTE) {
                        r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
                } else {
                        vcpu->arch.shared->dar = dar;
-                       vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+                       vcpu->arch.shared->dsisr = fault_dsisr;
                        kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
                        r = RESUME_GUEST;
                }
        case BOOK3S_INTERRUPT_PROGRAM:
        {
                enum emulation_result er;
+               struct kvmppc_book3s_shadow_vcpu *svcpu;
                ulong flags;
  
  program_interrupt:
-               flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
+               svcpu = svcpu_get(vcpu);
+               flags = svcpu->shadow_srr1 & 0x1f0000ull;
+               svcpu_put(svcpu);
  
                if (vcpu->arch.shared->msr & MSR_PR) {
  #ifdef EXIT_DEBUG
                r = RESUME_GUEST;
                break;
        default:
+       {
+               struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+               ulong shadow_srr1 = svcpu->shadow_srr1;
+               svcpu_put(svcpu);
                /* Ugh - bork here! What did we get? */
                printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-                       exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
+                       exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
                r = RESUME_HOST;
                BUG();
                break;
        }
+       }
  
        if (!(r & RESUME_HOST)) {
                /* To avoid clobbering exit_reason, only check for signals if
                 * we aren't already exiting to userspace for some other
                 * reason. */
+               /*
+                * Interrupts could be timers for the guest which we have to
+                * inject again, so let's postpone them until we're in the guest
+                * and if we really did time things so badly, then we just exit
+                * again due to a host external interrupt.
+                */
+               __hard_irq_disable();
                if (signal_pending(current)) {
+                       __hard_irq_enable();
  #ifdef EXIT_DEBUG
                        printk(KERN_EMERG "KVM: Going back to host\n");
  #endif
                        run->exit_reason = KVM_EXIT_INTR;
                        r = -EINTR;
                } else {
+                       preempt_disable();
                        /* In case an interrupt came in that was triggered
                         * from userspace (like DEC), we need to check what
                         * to inject now! */
-                       kvmppc_core_deliver_interrupts(vcpu);
+                       kvmppc_core_prepare_to_enter(vcpu);
                }
        }
  
@@@ -836,6 -874,38 +874,38 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
        return 0;
  }
  
+ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+ {
+       int r = -EINVAL;
+       switch (reg->id) {
+       case KVM_REG_PPC_HIOR:
+               r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
+               break;
+       default:
+               break;
+       }
+       return r;
+ }
+ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+ {
+       int r = -EINVAL;
+       switch (reg->id) {
+       case KVM_REG_PPC_HIOR:
+               r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr);
+               if (!r)
+                       to_book3s(vcpu)->hior_explicit = true;
+               break;
+       default:
+               break;
+       }
+       return r;
+ }
  int kvmppc_core_check_processor_compat(void)
  {
        return 0;
@@@ -923,16 -993,31 +993,31 @@@ int kvmppc_vcpu_run(struct kvm_run *kvm
  #endif
        ulong ext_msr;
  
+       preempt_disable();
        /* Check if we can run the vcpu at all */
        if (!vcpu->arch.sane) {
                kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               return -EINVAL;
+               ret = -EINVAL;
+               goto out;
        }
  
+       kvmppc_core_prepare_to_enter(vcpu);
+       /*
+        * Interrupts could be timers for the guest which we have to inject
+        * again, so let's postpone them until we're in the guest and if we
+        * really did time things so badly, then we just exit again due to
+        * a host external interrupt.
+        */
+       __hard_irq_disable();
        /* No need to go into the guest when all we do is going out */
        if (signal_pending(current)) {
+               __hard_irq_enable();
                kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
+               ret = -EINTR;
+               goto out;
        }
  
        /* Save FPU state in stack */
  
        kvm_guest_exit();
  
-       local_irq_disable();
        current->thread.regs->msr = ext_msr;
  
        /* Make sure we save the guest FPU/Altivec/VSX state */
        current->thread.used_vsr = used_vsr;
  #endif
  
+ out:
+       preempt_enable();
        return ret;
  }
  
+ /*
+  * Get (and clear) the dirty memory log for a memory slot.
+  */
+ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+ {
+       struct kvm_memory_slot *memslot;
+       struct kvm_vcpu *vcpu;
+       ulong ga, ga_end;
+       int is_dirty = 0;
+       int r;
+       unsigned long n;
+       mutex_lock(&kvm->slots_lock);
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
+       /* If nothing is dirty, don't bother messing with page tables. */
+       if (is_dirty) {
+               memslot = id_to_memslot(kvm->memslots, log->slot);
+               ga = memslot->base_gfn << PAGE_SHIFT;
+               ga_end = ga + (memslot->npages << PAGE_SHIFT);
+               kvm_for_each_vcpu(n, vcpu, kvm)
+                       kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
+               n = kvm_dirty_bitmap_bytes(memslot);
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+       r = 0;
+ out:
+       mutex_unlock(&kvm->slots_lock);
+       return r;
+ }
  int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                      struct kvm_userspace_memory_region *mem)
  {
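
The kvm_vcpu_ioctl_get_one_reg()/kvm_vcpu_ioctl_set_one_reg() handlers added in
book3s_pr.c above plug into the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG vcpu ioctls.
A minimal userspace sketch, assuming a 3.4-era <linux/kvm.h> for struct kvm_one_reg
and KVM_REG_PPC_HIOR:

/*
 * Hedged userspace sketch: set the guest HIOR through the one-reg interface.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_guest_hior(int vcpu_fd, __u64 hior)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_HIOR,
		.addr = (__u64)(unsigned long)&hior,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}

Setting HIOR this way flips hior_explicit, so the defaults applied in
kvmppc_set_pvr() above no longer override the userspace-chosen value.
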
diff --combined arch/powerpc/mm/hugetlbpage.c
index a3e628727697577a285ddc37e955353443496a59,f348c3d904046eaefd74b6cbdb2955535b602c0b..fb05b123218f389eaea96a9022d5b39f8cd18188
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/io.h>
  #include <linux/slab.h>
  #include <linux/hugetlb.h>
+ #include <linux/export.h>
  #include <linux/of_fdt.h>
  #include <linux/memblock.h>
  #include <linux/bootmem.h>
@@@ -103,6 -104,7 +104,7 @@@ pte_t *find_linux_pte_or_hugepte(pgd_t 
                *shift = hugepd_shift(*hpdp);
        return hugepte_offset(hpdp, ea, pdshift);
  }
+ EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
  
  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  {
@@@ -310,8 -312,7 +312,8 @@@ void __init reserve_hugetlb_gpages(void
        int i;
  
        strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
 -      parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
 +      parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
 +                      &do_gpage_early_setup);
  
        /*
         * Walk gpage list in reverse, allocating larger page sizes first.
@@@ -911,9 -912,9 +913,9 @@@ void flush_dcache_icache_hugepage(struc
                if (!PageHighMem(page)) {
                        __flush_dcache_icache(page_address(page+i));
                } else {
 -                      start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
 +                      start = kmap_atomic(page+i);
                        __flush_dcache_icache(start);
 -                      kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
 +                      kunmap_atomic(start);
                }
        }
  }
diff --combined arch/s390/kvm/interrupt.c
index f0647ce6da21db24bb926afe148f8f3bc828fcf2,c6366cfb3bf072305724d0548d0fc683533a4e7d..2d9f9a72bb8108ddb3ea4c03c3339eb0d3f5d118
@@@ -134,7 -134,7 +134,7 @@@ static void __do_deliver_interrupt(stru
                if (rc == -EFAULT)
                        exception = 1;
  
 -              rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->emerg.code);
 +              rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code);
                if (rc == -EFAULT)
                        exception = 1;
  
                if (rc == -EFAULT)
                        exception = 1;
  
 -              rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
 +              rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code);
                if (rc == -EFAULT)
                        exception = 1;
  
                if (rc == -EFAULT)
                        exception = 1;
  
 -              rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, 0x0d00);
 +              rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00);
                if (rc == -EFAULT)
                        exception = 1;
  
                VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
                           inti->prefix.address);
                vcpu->stat.deliver_prefix_signal++;
-               vcpu->arch.sie_block->prefix = inti->prefix.address;
-               vcpu->arch.sie_block->ihcpu = 0xffff;
+               kvm_s390_set_prefix(vcpu, inti->prefix.address);
                break;
  
        case KVM_S390_RESTART:
diff --combined arch/x86/include/asm/perf_event.h
index e8fb2c7a5f4ff1792c16dc1c8df498a0801b8eed,f1f71823f6828b5cd88640a6d6144bbb3a63e066..2291895b1836a4acf9b9313722fb670717f0d4c5
@@@ -23,6 -23,7 +23,7 @@@
  #define ARCH_PERFMON_EVENTSEL_USR                     (1ULL << 16)
  #define ARCH_PERFMON_EVENTSEL_OS                      (1ULL << 17)
  #define ARCH_PERFMON_EVENTSEL_EDGE                    (1ULL << 18)
+ #define ARCH_PERFMON_EVENTSEL_PIN_CONTROL             (1ULL << 19)
  #define ARCH_PERFMON_EVENTSEL_INT                     (1ULL << 20)
  #define ARCH_PERFMON_EVENTSEL_ANY                     (1ULL << 21)
  #define ARCH_PERFMON_EVENTSEL_ENABLE                  (1ULL << 22)
@@@ -188,6 -189,8 +189,6 @@@ extern u32 get_ibs_caps(void)
  #ifdef CONFIG_PERF_EVENTS
  extern void perf_events_lapic_init(void);
  
 -#define PERF_EVENT_INDEX_OFFSET                       0
 -
  /*
   * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
   * This flag is otherwise unused and ABI specified to be 0, so nobody should
@@@ -240,12 -243,4 +241,12 @@@ static inline void perf_get_x86_pmu_cap
  static inline void perf_events_lapic_init(void)       { }
  #endif
  
 +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
 + extern void amd_pmu_enable_virt(void);
 + extern void amd_pmu_disable_virt(void);
 +#else
 + static inline void amd_pmu_enable_virt(void) { }
 + static inline void amd_pmu_disable_virt(void) { }
 +#endif
 +
  #endif /* _ASM_X86_PERF_EVENT_H */
diff --combined arch/x86/kernel/smpboot.c
index e578a79a3093253038f6b0d9aef4292e9540104a,a05d6fd5e06d5167f0afb9b06aad11dd9c145ba0..5104a2b685cf4b5a538d451cecc694bdf6427285
@@@ -255,6 -255,7 +255,7 @@@ notrace static void __cpuinit start_sec
         * most necessary things.
         */
        cpu_init();
+       x86_cpuinit.early_percpu_clock_init();
        preempt_disable();
        smp_callin();
  
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
        x86_platform.nmi_init();
  
 -      /*
 -       * Wait until the cpu which brought this one up marked it
 -       * online before enabling interrupts. If we don't do that then
 -       * we can end up waking up the softirq thread before this cpu
 -       * reached the active state, which makes the scheduler unhappy
 -       * and schedule the softirq thread on the wrong cpu. This is
 -       * only observable with forced threaded interrupts, but in
 -       * theory it could also happen w/o them. It's just way harder
 -       * to achieve.
 -       */
 -      while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
 -              cpu_relax();
 -
        /* enable local interrupts */
        local_irq_enable();
  
@@@ -727,6 -741,8 +728,6 @@@ do_rest
         * the targeted processor.
         */
  
 -      printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
 -
        atomic_set(&init_deasserted, 0);
  
        if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
                        schedule();
                }
  
 -              if (cpumask_test_cpu(cpu, cpu_callin_mask))
 +              if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
 +                      print_cpu_msr(&cpu_data(cpu));
                        pr_debug("CPU%d: has booted.\n", cpu);
 -              else {
 +              } else {
                        boot_error = 1;
                        if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
                            == 0xA5A5A5A5)
@@@ -833,7 -848,7 +834,7 @@@ int __cpuinit native_cpu_up(unsigned in
  
        if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
            !physid_isset(apicid, phys_cpu_present_map) ||
 -          (!x2apic_mode && apicid >= 255)) {
 +          !apic->apic_id_valid(apicid)) {
                printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
                return -EINVAL;
        }
diff --combined arch/x86/kernel/tsc.c
index 183c5925a9fec5f44b46476468bebc69f4e82ffd,aed2aa1088f11cd5c0838fbdef0920bdb2afe6d3..899a03f2d1813e756d38fe9d19ed187947dc654e
@@@ -620,8 -620,7 +620,8 @@@ static void set_cyc2ns_scale(unsigned l
  
        if (cpu_khz) {
                *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
 -              *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
 +              *offset = ns_now - mult_frac(tsc_now, *scale,
 +                                           (1UL << CYC2NS_SCALE_FACTOR));
        }
  
        sched_clock_idle_wakeup_event(0);
  
  static unsigned long long cyc2ns_suspend;
  
- void save_sched_clock_state(void)
+ void tsc_save_sched_clock_state(void)
  {
        if (!sched_clock_stable)
                return;
   * that sched_clock() continues from the point where it was left off during
   * suspend.
   */
- void restore_sched_clock_state(void)
+ void tsc_restore_sched_clock_state(void)
  {
        unsigned long long offset;
        unsigned long flags;
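
The set_cyc2ns_scale() change above replaces a shift of the full 64-bit product
(tsc_now * *scale) with mult_frac(), which divides before multiplying so a large TSC
value can no longer overflow the intermediate product. For reference, mult_frac() as
I recall it from include/linux/kernel.h (verify against the tree):

/*
 * Computes x * numer / denom without forming the full x * numer product,
 * so tsc_now * (*scale) in the hunk above cannot overflow 64 bits.
 */
#define mult_frac(x, numer, denom)(			\
{							\
	typeof(x) quot = (x) / (denom);			\
	typeof(x) rem  = (x) % (denom);			\
	(quot * (numer)) + ((rem * (numer)) / (denom));	\
}							\
)
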
diff --combined arch/x86/kvm/lapic.c
index 31bfc6927bc0a25eae47da65e0884365e8fc4439,72975f758c83023b277be6e82d44830d4be11db6..858432287ab626dee9ce4568404fdf4f968a89a2
@@@ -433,7 -433,7 +433,7 @@@ static int __apic_accept_irq(struct kvm
                break;
  
        case APIC_DM_INIT:
-               if (level) {
+               if (!trig_mode || level) {
                        result = 1;
                        vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
@@@ -731,7 -731,7 +731,7 @@@ static void start_apic_timer(struct kvm
                u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
                u64 ns = 0;
                struct kvm_vcpu *vcpu = apic->vcpu;
-               unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+               unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
                unsigned long flags;
  
                if (unlikely(!tscdeadline || !this_tsc_khz))
@@@ -1283,9 -1283,9 +1283,9 @@@ void kvm_lapic_sync_from_vapic(struct k
        if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
                return;
  
 -      vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
 +      vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
        data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
 -      kunmap_atomic(vapic, KM_USER0);
 +      kunmap_atomic(vapic);
  
        apic_set_tpr(vcpu->arch.apic, data & 0xff);
  }
@@@ -1310,9 -1310,9 +1310,9 @@@ void kvm_lapic_sync_to_vapic(struct kvm
                max_isr = 0;
        data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
  
 -      vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
 +      vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
        *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
 -      kunmap_atomic(vapic, KM_USER0);
 +      kunmap_atomic(vapic);
  }
  
  void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
diff --combined arch/x86/kvm/mmu_audit.c
index ea7b4fd34676fe08062e8e76e0825a4170fc17fe,6eabae3d77ff7d3711adfe63cf114d2eb2cc8121..715da5a19a5b6cf8a6abec0a96fddf1d2260cc85
@@@ -200,13 -200,13 +200,13 @@@ static void audit_write_protection(stru
        slot = gfn_to_memslot(kvm, sp->gfn);
        rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
  
-       spte = rmap_next(kvm, rmapp, NULL);
+       spte = rmap_next(rmapp, NULL);
        while (spte) {
                if (is_writable_pte(*spte))
                        audit_printk(kvm, "shadow page has writable "
                                     "mappings: gfn %llx role %x\n",
                                     sp->gfn, sp->role.word);
-               spte = rmap_next(kvm, rmapp, spte);
+               spte = rmap_next(rmapp, spte);
        }
  }
  
@@@ -234,7 -234,7 +234,7 @@@ static void audit_vcpu_spte(struct kvm_
  }
  
  static bool mmu_audit;
 -static struct jump_label_key mmu_audit_key;
 +static struct static_key mmu_audit_key;
  
  static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
  {
  
  static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
  {
 -      if (static_branch((&mmu_audit_key)))
 +      if (static_key_false((&mmu_audit_key)))
                __kvm_mmu_audit(vcpu, point);
  }
  
@@@ -259,7 -259,7 +259,7 @@@ static void mmu_audit_enable(void
        if (mmu_audit)
                return;
  
 -      jump_label_inc(&mmu_audit_key);
 +      static_key_slow_inc(&mmu_audit_key);
        mmu_audit = true;
  }
  
@@@ -268,7 -268,7 +268,7 @@@ static void mmu_audit_disable(void
        if (!mmu_audit)
                return;
  
 -      jump_label_dec(&mmu_audit_key);
 +      static_key_slow_dec(&mmu_audit_key);
        mmu_audit = false;
  }
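
The mmu_audit.c hunks above track the jump-label API rename: struct jump_label_key
becomes struct static_key, static_branch() becomes static_key_false(), and
jump_label_inc()/jump_label_dec() become static_key_slow_inc()/static_key_slow_dec().
A hedged sketch of the renamed API in isolation; my_feature_key and
do_expensive_audit() are illustrative names, not from this merge.

/*
 * Hedged sketch of the 3.4-era <linux/jump_label.h> static-key API.
 */
#include <linux/jump_label.h>
#include <linux/types.h>

static struct static_key my_feature_key;	/* defaults to false/off */

void do_expensive_audit(void);			/* illustrative slow path */

static inline void fast_path_hook(void)
{
	/* Patched to a no-op branch while the key is off. */
	if (static_key_false(&my_feature_key))
		do_expensive_audit();
}

static void my_feature_set(bool on)
{
	if (on)
		static_key_slow_inc(&my_feature_key);
	else
		static_key_slow_dec(&my_feature_key);
}

static_key_false() is intended for keys that default to off, so while auditing is
disabled the hook costs only a patched no-op on the fast path.
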
  
diff --combined arch/x86/kvm/svm.c
index e385214711cbcf005846c1d0e335502a6c7d9709,53efd597f39e23e192f548adcf984e38e6f0c17a..e334389e1c755eb471c146264e4bf1021ca5333a
@@@ -29,7 -29,6 +29,7 @@@
  #include <linux/ftrace_event.h>
  #include <linux/slab.h>
  
 +#include <asm/perf_event.h>
  #include <asm/tlbflush.h>
  #include <asm/desc.h>
  #include <asm/kvm_para.h>
@@@ -111,6 -110,12 +111,12 @@@ struct nested_state 
  #define MSRPM_OFFSETS 16
  static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  
+ /*
+  * Set osvw_len to higher value when updated Revision Guides
+  * are published and we know what the new status bits are
+  */
+ static uint64_t osvw_len = 4, osvw_status;
  struct vcpu_svm {
        struct kvm_vcpu vcpu;
        struct vmcb *vmcb;
@@@ -177,11 -182,13 +183,13 @@@ static bool npt_enabled = true
  #else
  static bool npt_enabled;
  #endif
- static int npt = 1;
  
+ /* allow nested paging (virtualized MMU) for all guests */
+ static int npt = true;
  module_param(npt, int, S_IRUGO);
  
- static int nested = 1;
+ /* allow nested virtualization in KVM/SVM */
+ static int nested = true;
  module_param(nested, int, S_IRUGO);
  
  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@@ -557,6 -564,27 +565,27 @@@ static void svm_init_erratum_383(void
        erratum_383_found = true;
  }
  
+ static void svm_init_osvw(struct kvm_vcpu *vcpu)
+ {
+       /*
+        * Guests should see errata 400 and 415 as fixed (assuming that
+        * HLT and IO instructions are intercepted).
+        */
+       vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+       vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+       /*
+        * By increasing VCPU's osvw.length to 3 we are telling the guest that
+        * all osvw.status bits inside that length, including bit 0 (which is
+        * reserved for erratum 298), are valid. However, if host processor's
+        * osvw_len is 0 then osvw_status[0] carries no information. We need to
+        * be conservative here and therefore we tell the guest that erratum 298
+        * is present (because we really don't know).
+        */
+       if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+               vcpu->arch.osvw.status |= 1;
+ }
  static int has_svm(void)
  {
        const char *msg;
@@@ -576,8 -604,6 +605,8 @@@ static void svm_hardware_disable(void *
                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
  
        cpu_svm_disable();
 +
 +      amd_pmu_disable_virt();
  }
  
  static int svm_hardware_enable(void *garbage)
                __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
        }
  
+       /*
+        * Get OSVW bits.
+        *
+        * Note that it is possible to have a system with mixed processor
+        * revisions and therefore different OSVW bits. If bits are not the same
+        * on different processors then choose the worst case (i.e. if erratum
+        * is present on one processor and not on another then assume that the
+        * erratum is present everywhere).
+        */
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+               uint64_t len, status = 0;
+               int err;
+               len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+               if (!err)
+                       status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+                                                     &err);
+               if (err)
+                       osvw_status = osvw_len = 0;
+               else {
+                       if (len < osvw_len)
+                               osvw_len = len;
+                       osvw_status |= status;
+                       osvw_status &= (1ULL << osvw_len) - 1;
+               }
+       } else
+               osvw_status = osvw_len = 0;
        svm_init_erratum_383();
  
 +      amd_pmu_enable_virt();
 +
        return 0;
  }
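
A minimal user-space sketch of the worst-case OSVW merge performed above, assuming two made-up per-CPU (length, status) readings; it mirrors the shortest-length-wins / OR-the-status logic but is not part of the patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t osvw_len = 4, osvw_status = 0;         /* same defaults as the patch */
        uint64_t cpu_len[2]    = { 4, 3 };              /* hypothetical per-CPU OSVW_ID_LENGTH */
        uint64_t cpu_status[2] = { 0x5, 0x2 };          /* hypothetical per-CPU OSVW_STATUS */

        for (int i = 0; i < 2; i++) {
                if (cpu_len[i] < osvw_len)              /* shortest valid length wins */
                        osvw_len = cpu_len[i];
                osvw_status |= cpu_status[i];           /* erratum assumed present if any CPU reports it */
                osvw_status &= (1ULL << osvw_len) - 1;  /* drop bits beyond the merged length */
        }
        printf("merged: len=%llu status=%#llx\n",
               (unsigned long long)osvw_len, (unsigned long long)osvw_status);
        return 0;
}
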
  
@@@ -910,20 -964,25 +969,25 @@@ static u64 svm_scale_tsc(struct kvm_vcp
        return _tsc;
  }
  
- static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 ratio;
        u64 khz;
  
-       /* TSC scaling supported? */
-       if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+       /* Guest TSC same frequency as host TSC? */
+       if (!scale) {
+               svm->tsc_ratio = TSC_RATIO_DEFAULT;
                return;
+       }
  
-       /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-       if (user_tsc_khz == 0) {
-               vcpu->arch.virtual_tsc_khz = 0;
-               svm->tsc_ratio = TSC_RATIO_DEFAULT;
+       /* TSC scaling supported? */
+       if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+               if (user_tsc_khz > tsc_khz) {
+                       vcpu->arch.tsc_catchup = 1;
+                       vcpu->arch.tsc_always_catchup = 1;
+               } else
+                       WARN(1, "user requested TSC rate below hardware speed\n");
                return;
        }
  
                                user_tsc_khz);
                return;
        }
-       vcpu->arch.virtual_tsc_khz = user_tsc_khz;
        svm->tsc_ratio             = ratio;
  }
  
@@@ -958,10 -1016,14 +1021,14 @@@ static void svm_write_tsc_offset(struc
        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
  }
  
- static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
  
+       WARN_ON(adjustment < 0);
+       if (host)
+               adjustment = svm_scale_tsc(vcpu, adjustment);
        svm->vmcb->control.tsc_offset += adjustment;
        if (is_guest_mode(vcpu))
                svm->nested.hsave->control.tsc_offset += adjustment;
@@@ -1191,6 -1253,8 +1258,8 @@@ static struct kvm_vcpu *svm_create_vcpu
        if (kvm_vcpu_is_bsp(&svm->vcpu))
                svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
  
+       svm_init_osvw(&svm->vcpu);
        return &svm->vcpu;
  
  free_page4:
@@@ -1268,6 -1332,21 +1337,21 @@@ static void svm_vcpu_put(struct kvm_vcp
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
  }
  
+ static void svm_update_cpl(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int cpl;
+       if (!is_protmode(vcpu))
+               cpl = 0;
+       else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+               cpl = 3;
+       else
+               cpl = svm->vmcb->save.cs.selector & 0x3;
+       svm->vmcb->save.cpl = cpl;
+ }
  static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
  {
        return to_svm(vcpu)->vmcb->save.rflags;
  
  static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  {
+       unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
        to_svm(vcpu)->vmcb->save.rflags = rflags;
+       if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+               svm_update_cpl(vcpu);
  }
  
  static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@@ -1543,9 -1626,7 +1631,7 @@@ static void svm_set_segment(struct kvm_
                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
        }
        if (seg == VCPU_SREG_CS)
-               svm->vmcb->save.cpl
-                       = (svm->vmcb->save.cs.attrib
-                          >> SVM_SELECTOR_DPL_SHIFT) & 3;
+               svm_update_cpl(vcpu);
  
        mark_dirty(svm->vmcb, VMCB_SEG);
  }
@@@ -2735,7 -2816,10 +2821,10 @@@ static int task_switch_interception(str
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
                skip_emulated_instruction(&svm->vcpu);
  
-       if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+       if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+               int_vec = -1;
+       if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
                                has_error_code, error_code) == EMULATE_FAIL) {
                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
diff --combined arch/x86/kvm/vmx.c
index 246490f643b64d06940d40e0a6bd3ec0cb431f7f,2c22fc788da23e5a70f45d621cfdd512a68bb38f..280751c84724a087ce5a6d9bbeb9c3dba67541bc
@@@ -70,9 -70,6 +70,6 @@@ module_param(emulate_invalid_guest_stat
  static bool __read_mostly vmm_exclusive = 1;
  module_param(vmm_exclusive, bool, S_IRUGO);
  
- static bool __read_mostly yield_on_hlt = 1;
- module_param(yield_on_hlt, bool, S_IRUGO);
  static bool __read_mostly fasteoi = 1;
  module_param(fasteoi, bool, S_IRUGO);
  
@@@ -1457,7 -1454,7 +1454,7 @@@ static void __vmx_load_host_state(struc
  #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
  #endif
 -      if (__thread_has_fpu(current))
 +      if (user_has_fpu())
                clts();
        load_gdt(&__get_cpu_var(host_gdt));
  }
@@@ -1655,17 -1652,6 +1652,6 @@@ static void skip_emulated_instruction(s
        vmx_set_interrupt_shadow(vcpu, 0);
  }
  
- static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
- {
-       /* Ensure that we clear the HLT state in the VMCS.  We don't need to
-        * explicitly skip the instruction because if the HLT state is set, then
-        * the instruction is already executing and RIP has already been
-        * advanced. */
-       if (!yield_on_hlt &&
-           vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-               vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
- }
  /*
   * KVM wants to inject page-faults which it got to the guest. This function
   * checks whether in a nested guest, we need to inject them to L1 or L2.
@@@ -1678,7 -1664,7 +1664,7 @@@ static int nested_pf_handled(struct kvm
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
        /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (!(vmcs12->exception_bitmap & PF_VECTOR))
+       if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
                return 0;
  
        nested_vmx_vmexit(vcpu);
@@@ -1718,7 -1704,6 +1704,6 @@@ static void vmx_queue_exception(struct 
                intr_info |= INTR_TYPE_HARD_EXCEPTION;
  
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-       vmx_clear_hlt(vcpu);
  }
  
  static bool vmx_rdtscp_supported(void)
@@@ -1817,13 -1802,19 +1802,19 @@@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vc
  }
  
  /*
-  * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
-  * ioctl. In this case the call-back should update internal vmx state to make
-  * the changes effective.
+  * Engage any workarounds for mis-matched TSC rates.  Currently limited to
+  * software catchup for faster rates on slower CPUs.
   */
- static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
  {
-       /* Nothing to do here */
+       if (!scale)
+               return;
+       if (user_tsc_khz > tsc_khz) {
+               vcpu->arch.tsc_catchup = 1;
+               vcpu->arch.tsc_always_catchup = 1;
+       } else
+               WARN(1, "user requested TSC rate below hardware speed\n");
  }
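
A tiny stand-alone sketch (hypothetical rates, not part of the patch) of the decision both set_tsc_khz callbacks now make when hardware TSC scaling is unavailable: a faster-than-host guest rate can be approximated with software catchup, a slower one cannot:

#include <stdio.h>

static void pick_tsc_strategy(unsigned int user_tsc_khz, unsigned int host_tsc_khz)
{
        if (user_tsc_khz == host_tsc_khz)
                printf("%u kHz: run unscaled\n", user_tsc_khz);
        else if (user_tsc_khz > host_tsc_khz)
                /* corresponds to tsc_catchup = tsc_always_catchup = 1 above */
                printf("%u kHz: enable software catchup\n", user_tsc_khz);
        else
                /* corresponds to the WARN() above */
                printf("%u kHz: below hardware speed, cannot be emulated\n", user_tsc_khz);
}

int main(void)
{
        pick_tsc_strategy(3000000, 2600000);    /* faster guest: catchup */
        pick_tsc_strategy(2000000, 2600000);    /* slower guest: warn */
        return 0;
}
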
  
  /*
@@@ -1850,7 -1841,7 +1841,7 @@@ static void vmx_write_tsc_offset(struc
        }
  }
  
- static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
  {
        u64 offset = vmcs_read64(TSC_OFFSET);
        vmcs_write64(TSC_OFFSET, offset + adjustment);
@@@ -2219,6 -2210,9 +2210,9 @@@ static int vmx_set_msr(struct kvm_vcpu 
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
                        msr->data = data;
+                       if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+                               kvm_set_shared_msr(msr->index, msr->data,
+                                                  msr->mask);
                        break;
                }
                ret = kvm_set_msr_common(vcpu, msr_index, data);
@@@ -2399,7 -2393,7 +2393,7 @@@ static __init int setup_vmcs_config(str
                                &_pin_based_exec_control) < 0)
                return -EIO;
  
-       min =
+       min = CPU_BASED_HLT_EXITING |
  #ifdef CONFIG_X86_64
              CPU_BASED_CR8_LOAD_EXITING |
              CPU_BASED_CR8_STORE_EXITING |
              CPU_BASED_INVLPG_EXITING |
              CPU_BASED_RDPMC_EXITING;
  
-       if (yield_on_hlt)
-               min |= CPU_BASED_HLT_EXITING;
        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@@ -4003,7 -3994,6 +3994,6 @@@ static void vmx_inject_irq(struct kvm_v
        } else
                intr |= INTR_TYPE_EXT_INTR;
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-       vmx_clear_hlt(vcpu);
  }
  
  static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-       vmx_clear_hlt(vcpu);
  }
  
  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@@ -4672,9 -4661,10 +4661,10 @@@ static int handle_task_switch(struct kv
        bool has_error_code = false;
        u32 error_code = 0;
        u16 tss_selector;
-       int reason, type, idt_v;
+       int reason, type, idt_v, idt_index;
  
        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+       idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
  
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
                       type != INTR_TYPE_NMI_INTR))
                skip_emulated_instruction(vcpu);
  
-       if (kvm_task_switch(vcpu, tss_selector, reason,
-                               has_error_code, error_code) == EMULATE_FAIL) {
+       if (kvm_task_switch(vcpu, tss_selector,
+                           type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+                           has_error_code, error_code) == EMULATE_FAIL) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
diff --combined arch/x86/kvm/x86.c
index 54696b5f8443509eb9ed94ab77c9617b46f4407b,7287812eeb729745ba0db6f0be2ec4a6f2a025a3..4044ce0bf7c1e7741620b8fcda9a78c9cb1bc775
@@@ -57,7 -57,6 +57,7 @@@
  #include <asm/mtrr.h>
  #include <asm/mce.h>
  #include <asm/i387.h>
 +#include <asm/fpu-internal.h> /* Ugh! */
  #include <asm/xcr.h>
  #include <asm/pvclock.h>
  #include <asm/div64.h>
@@@ -97,6 -96,10 +97,10 @@@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control)
  u32  kvm_max_guest_tsc_khz;
  EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
  
+ /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+ static u32 tsc_tolerance_ppm = 250;
+ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  #define KVM_NR_SHARED_MSRS 16
  
  struct kvm_shared_msrs_global {
@@@ -969,50 -972,51 +973,51 @@@ static inline u64 get_kernel_ns(void
  static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
  unsigned long max_tsc_khz;
  
- static inline int kvm_tsc_changes_freq(void)
+ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
  {
-       int cpu = get_cpu();
-       int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-                 cpufreq_quick_get(cpu) != 0;
-       put_cpu();
-       return ret;
+       return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+                                  vcpu->arch.virtual_tsc_shift);
  }
  
- u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
  {
-       if (vcpu->arch.virtual_tsc_khz)
-               return vcpu->arch.virtual_tsc_khz;
-       else
-               return __this_cpu_read(cpu_tsc_khz);
+       u64 v = (u64)khz * (1000000 + ppm);
+       do_div(v, 1000000);
+       return v;
  }
  
- static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
  {
-       u64 ret;
+       u32 thresh_lo, thresh_hi;
+       int use_scaling = 0;
  
-       WARN_ON(preemptible());
-       if (kvm_tsc_changes_freq())
-               printk_once(KERN_WARNING
-                "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-       ret = nsec * vcpu_tsc_khz(vcpu);
-       do_div(ret, USEC_PER_SEC);
-       return ret;
- }
- static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
- {
        /* Compute a scale to convert nanoseconds in TSC cycles */
        kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-                          &vcpu->arch.tsc_catchup_shift,
-                          &vcpu->arch.tsc_catchup_mult);
+                          &vcpu->arch.virtual_tsc_shift,
+                          &vcpu->arch.virtual_tsc_mult);
+       vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+       /*
+        * Compute the variation in TSC rate which is acceptable
+        * within the range of tolerance and decide if the
+        * rate being applied is within those bounds of the hardware
+        * rate.  If so, no scaling or compensation need be done.
+        */
+       thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+       thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+       if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+               pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+               use_scaling = 1;
+       }
+       kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
  }
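
A worked example of the tolerance window computed above, using the same formula as adjust_tsc_khz() and the default tsc_tolerance_ppm of 250; the 2.6 GHz host rate is an assumption for illustration:

#include <stdio.h>
#include <stdint.h>

static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
{
        return (uint64_t)khz * (1000000 + ppm) / 1000000;
}

int main(void)
{
        uint32_t tsc_khz = 2600000;     /* assumed host rate */
        int32_t ppm = 250;              /* default tsc_tolerance_ppm */
        uint32_t lo = adjust_khz(tsc_khz, -ppm);
        uint32_t hi = adjust_khz(tsc_khz, ppm);

        /* prints [2599350, 2600650] kHz for a 2.6 GHz host */
        printf("tolerance window: [%u, %u] kHz\n", lo, hi);
        return 0;
}
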
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
  {
-       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-                                     vcpu->arch.tsc_catchup_mult,
-                                     vcpu->arch.tsc_catchup_shift);
-       tsc += vcpu->arch.last_tsc_write;
+       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+                                     vcpu->arch.virtual_tsc_mult,
+                                     vcpu->arch.virtual_tsc_shift);
+       tsc += vcpu->arch.this_tsc_write;
        return tsc;
  }
  
@@@ -1021,48 -1025,88 +1026,88 @@@ void kvm_write_tsc(struct kvm_vcpu *vcp
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
-       s64 sdiff;
+       s64 usdiff;
  
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
        ns = get_kernel_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
-       sdiff = data - kvm->arch.last_tsc_write;
-       if (sdiff < 0)
-               sdiff = -sdiff;
+       /* n.b - signed multiplication and division required */
+       usdiff = data - kvm->arch.last_tsc_write;
+ #ifdef CONFIG_X86_64
+       usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+ #else
+       /* do_div() only does unsigned */
+       asm("idivl %2; xor %%edx, %%edx"
+           : "=A"(usdiff)
+           : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+ #endif
+       do_div(elapsed, 1000);
+       usdiff -= elapsed;
+       if (usdiff < 0)
+               usdiff = -usdiff;
  
        /*
-        * Special case: close write to TSC within 5 seconds of
-        * another CPU is interpreted as an attempt to synchronize
-        * The 5 seconds is to accommodate host load / swapping as
-        * well as any reset of TSC during the boot process.
-        *
-        * In that case, for a reliable TSC, we can match TSC offsets,
-        * or make a best guest using elapsed value.
-        */
-       if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
-           elapsed < 5ULL * NSEC_PER_SEC) {
+        * Special case: TSC write with a small delta (1 second) of virtual
+        * cycle time against real time is interpreted as an attempt to
+        * synchronize the CPU.
+        *
+        * For a reliable TSC, we can match TSC offsets, and for an unstable
+        * TSC, we add elapsed time in this computation.  We could let the
+        * compensation code attempt to catch up if we fall behind, but
+        * it's better to try to match offsets from the beginning.
+        */
+       if (usdiff < USEC_PER_SEC &&
+           vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
                if (!check_tsc_unstable()) {
-                       offset = kvm->arch.last_tsc_offset;
+                       offset = kvm->arch.cur_tsc_offset;
                        pr_debug("kvm: matched tsc offset for %llu\n", data);
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
-                       offset += delta;
+                       data += delta;
+                       offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                }
-               ns = kvm->arch.last_tsc_nsec;
+       } else {
+               /*
+                * We split periods of matched TSC writes into generations.
+                * For each generation, we track the original measured
+                * nanosecond time, offset, and write, so if TSCs are in
+                * sync, we can match exact offset, and if not, we can match
+                * exact software computation in compute_guest_tsc()
+                *
+                * These values are tracked in kvm->arch.cur_xxx variables.
+                */
+               kvm->arch.cur_tsc_generation++;
+               kvm->arch.cur_tsc_nsec = ns;
+               kvm->arch.cur_tsc_write = data;
+               kvm->arch.cur_tsc_offset = offset;
+               pr_debug("kvm: new tsc generation %u, clock %llu\n",
+                        kvm->arch.cur_tsc_generation, data);
        }
+       /*
+        * We also track the most recent recorded KHZ, write and time to
+        * allow the matching interval to be extended at each write.
+        */
        kvm->arch.last_tsc_nsec = ns;
        kvm->arch.last_tsc_write = data;
-       kvm->arch.last_tsc_offset = offset;
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
-       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+       kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
  
        /* Reset of TSC must disable overshoot protection below */
        vcpu->arch.hv_clock.tsc_timestamp = 0;
-       vcpu->arch.last_tsc_write = data;
-       vcpu->arch.last_tsc_nsec = ns;
+       vcpu->arch.last_guest_tsc = data;
+       /* Keep track of which generation this VCPU has synchronized to */
+       vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+       vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+       vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  }
  EXPORT_SYMBOL_GPL(kvm_write_tsc);
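
A compact sketch with made-up numbers of the new matching test in kvm_write_tsc(): the TSC delta is converted to microseconds of virtual cycle time, the elapsed real microseconds are subtracted, and only a residual under one second is treated as a synchronization attempt:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        uint32_t virtual_tsc_khz = 2600000;     /* assumed guest rate */
        int64_t tsc_delta  = 2600000000LL;      /* cycles since last write */
        int64_t elapsed_ns = 1000200000LL;      /* ns since last write */

        int64_t usdiff = tsc_delta * 1000 / virtual_tsc_khz;    /* -> 1000000 us */
        usdiff -= elapsed_ns / 1000;                            /* -> -200 us */
        if (usdiff < 0)
                usdiff = -usdiff;

        printf("residual %" PRId64 " us -> %s\n", usdiff,
               usdiff < 1000000 ? "matched (sync attempt)" : "new generation");
        return 0;
}
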
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
        local_irq_save(flags);
        tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
        kernel_ns = get_kernel_ns();
-       this_tsc_khz = vcpu_tsc_khz(v);
+       this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
        if (unlikely(this_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
        if (vcpu->tsc_catchup) {
                u64 tsc = compute_guest_tsc(v, kernel_ns);
                if (tsc > tsc_timestamp) {
-                       kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+                       adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
                        tsc_timestamp = tsc;
                }
        }
         * observed by the guest and ensure the new system time is greater.
         */
        max_kernel_ns = 0;
-       if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+       if (vcpu->hv_clock.tsc_timestamp) {
                max_kernel_ns = vcpu->last_guest_tsc -
                                vcpu->hv_clock.tsc_timestamp;
                max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
         */
        vcpu->hv_clock.version += 2;
  
 -      shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 +      shared_kaddr = kmap_atomic(vcpu->time_page);
  
        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
               sizeof(vcpu->hv_clock));
  
 -      kunmap_atomic(shared_kaddr, KM_USER0);
 +      kunmap_atomic(shared_kaddr);
  
        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
        return 0;
@@@ -1504,6 -1548,7 +1549,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
+               data &= ~(u64)0x8;      /* ignore TLB cache disable */
                if (data != 0) {
                        pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                data);
                 */
                pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
                break;
+       case MSR_AMD64_OSVW_ID_LENGTH:
+               if (!guest_cpuid_has_osvw(vcpu))
+                       return 1;
+               vcpu->arch.osvw.length = data;
+               break;
+       case MSR_AMD64_OSVW_STATUS:
+               if (!guest_cpuid_has_osvw(vcpu))
+                       return 1;
+               vcpu->arch.osvw.status = data;
+               break;
        default:
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
@@@ -1960,6 -2015,16 +2016,16 @@@ int kvm_get_msr_common(struct kvm_vcpu 
                 */
                data = 0xbe702111;
                break;
+       case MSR_AMD64_OSVW_ID_LENGTH:
+               if (!guest_cpuid_has_osvw(vcpu))
+                       return 1;
+               data = vcpu->arch.osvw.length;
+               break;
+       case MSR_AMD64_OSVW_STATUS:
+               if (!guest_cpuid_has_osvw(vcpu))
+                       return 1;
+               data = vcpu->arch.osvw.status;
+               break;
        default:
                if (kvm_pmu_msr(vcpu, msr))
                        return kvm_pmu_get_msr(vcpu, msr, pdata);
@@@ -2080,6 -2145,7 +2146,7 @@@ int kvm_dev_ioctl_check_extension(long 
        case KVM_CAP_XSAVE:
        case KVM_CAP_ASYNC_PF:
        case KVM_CAP_GET_TSC_KHZ:
+       case KVM_CAP_PCI_2_3:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@@ -2214,19 -2280,23 +2281,23 @@@ void kvm_arch_vcpu_load(struct kvm_vcp
        }
  
        kvm_x86_ops->vcpu_load(vcpu, cpu);
-       if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-               /* Make sure TSC doesn't go backwards */
-               s64 tsc_delta;
-               u64 tsc;
  
-               tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-               tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
-                            tsc - vcpu->arch.last_guest_tsc;
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+               adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+               vcpu->arch.tsc_offset_adjustment = 0;
+               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+       }
  
+       if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+               s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+                               native_read_tsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
                if (check_tsc_unstable()) {
-                       kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+                       u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+                                               vcpu->arch.last_guest_tsc);
+                       kvm_x86_ops->write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@@ -2243,7 -2313,7 +2314,7 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
  {
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
-       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+       vcpu->arch.last_host_tsc = native_read_tsc();
  }
  
  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@@ -2785,26 -2855,21 +2856,21 @@@ long kvm_arch_vcpu_ioctl(struct file *f
                u32 user_tsc_khz;
  
                r = -EINVAL;
-               if (!kvm_has_tsc_control)
-                       break;
                user_tsc_khz = (u32)arg;
  
                if (user_tsc_khz >= kvm_max_guest_tsc_khz)
                        goto out;
  
-               kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+               if (user_tsc_khz == 0)
+                       user_tsc_khz = tsc_khz;
+               kvm_set_tsc_khz(vcpu, user_tsc_khz);
  
                r = 0;
                goto out;
        }
        case KVM_GET_TSC_KHZ: {
-               r = -EIO;
-               if (check_tsc_unstable())
-                       goto out;
-               r = vcpu_tsc_khz(vcpu);
+               r = vcpu->arch.virtual_tsc_khz;
                goto out;
        }
        default:
        return r;
  }
  
+ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+ {
+       return VM_FAULT_SIGBUS;
+ }
  static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
  {
        int ret;
@@@ -2998,6 -3068,8 +3069,8 @@@ static void write_protect_slot(struct k
                               unsigned long *dirty_bitmap,
                               unsigned long nr_dirty_pages)
  {
+       spin_lock(&kvm->mmu_lock);
        /* Not many dirty pages compared to # of shadow pages. */
        if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
                unsigned long gfn_offset;
                for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
                        unsigned long gfn = memslot->base_gfn + gfn_offset;
  
-                       spin_lock(&kvm->mmu_lock);
                        kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-                       spin_unlock(&kvm->mmu_lock);
                }
                kvm_flush_remote_tlbs(kvm);
-       } else {
-               spin_lock(&kvm->mmu_lock);
+       } else
                kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-               spin_unlock(&kvm->mmu_lock);
-       }
+       spin_unlock(&kvm->mmu_lock);
  }
  
  /*
@@@ -3133,6 -3202,9 +3203,9 @@@ long kvm_arch_vm_ioctl(struct file *fil
                r = -EEXIST;
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
+               r = -EINVAL;
+               if (atomic_read(&kvm->online_vcpus))
+                       goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
                if (vpic) {
@@@ -3849,7 -3921,7 +3922,7 @@@ static int emulator_cmpxchg_emulated(st
                goto emul_write;
        }
  
 -      kaddr = kmap_atomic(page, KM_USER0);
 +      kaddr = kmap_atomic(page);
        kaddr += offset_in_page(gpa);
        switch (bytes) {
        case 1:
        default:
                BUG();
        }
 -      kunmap_atomic(kaddr, KM_USER0);
 +      kunmap_atomic(kaddr);
        kvm_release_page_dirty(page);
  
        if (!exchanged)
@@@ -4063,6 -4135,11 +4136,11 @@@ static int emulator_set_cr(struct x86_e
        return res;
  }
  
+ static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+ {
+       kvm_set_rflags(emul_to_vcpu(ctxt), val);
+ }
  static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
  {
        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@@ -4244,6 -4321,7 +4322,7 @@@ static struct x86_emulate_ops emulate_o
        .set_idt             = emulator_set_idt,
        .get_cr              = emulator_get_cr,
        .set_cr              = emulator_set_cr,
+       .set_rflags          = emulator_set_rflags,
        .cpl                 = emulator_get_cpl,
        .get_dr              = emulator_get_dr,
        .set_dr              = emulator_set_dr,
@@@ -5288,6 -5366,8 +5367,8 @@@ static int vcpu_enter_guest(struct kvm_
                profile_hit(KVM_PROFILING, (void *)rip);
        }
  
+       if (unlikely(vcpu->arch.tsc_always_catchup))
+               kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
  
        kvm_lapic_sync_from_vapic(vcpu);
  
@@@ -5587,15 -5667,15 +5668,15 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
        return 0;
  }
  
- int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-                   bool has_error_code, u32 error_code)
+ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+                   int reason, bool has_error_code, u32 error_code)
  {
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int ret;
  
        init_emulate_ctxt(vcpu);
  
-       ret = emulator_task_switch(ctxt, tss_selector, reason,
+       ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                   has_error_code, error_code);
  
        if (ret)
@@@ -5928,13 -6008,88 +6009,88 @@@ int kvm_arch_hardware_enable(void *garb
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;
+       int ret;
+       u64 local_tsc;
+       u64 max_tsc = 0;
+       bool stable, backwards_tsc = false;
  
        kvm_shared_msr_cpu_online();
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       if (vcpu->cpu == smp_processor_id())
-                               kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-       return kvm_x86_ops->hardware_enable(garbage);
+       ret = kvm_x86_ops->hardware_enable(garbage);
+       if (ret != 0)
+               return ret;
+       local_tsc = native_read_tsc();
+       stable = !check_tsc_unstable();
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               kvm_for_each_vcpu(i, vcpu, kvm) {
+                       if (!stable && vcpu->cpu == smp_processor_id())
+                               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+                       if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+                               backwards_tsc = true;
+                               if (vcpu->arch.last_host_tsc > max_tsc)
+                                       max_tsc = vcpu->arch.last_host_tsc;
+                       }
+               }
+       }
+       /*
+        * Sometimes, even reliable TSCs go backwards.  This happens on
+        * platforms that reset TSC during suspend or hibernate actions, but
+        * maintain synchronization.  We must compensate.  Fortunately, we can
+        * detect that condition here, which happens early in CPU bringup,
+        * before any KVM threads can be running.  Unfortunately, we can't
+        * bring the TSCs fully up to date with real time, as we aren't yet far
+        * enough into CPU bringup that we know how much real time has actually
+        * elapsed; our helper function, get_kernel_ns() will be using boot
+        * variables that haven't been updated yet.
+        *
+        * So we simply find the maximum observed TSC above, then record the
+        * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+        * the adjustment will be applied.  Note that we accumulate
+        * adjustments, in case multiple suspend cycles happen before some VCPU
+        * gets a chance to run again.  In the event that no KVM threads get a
+        * chance to run, we will miss the entire elapsed period, as we'll have
+        * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+        * lose cycle time.  This isn't too big a deal, since the loss will be
+        * uniform across all VCPUs (not to mention the scenario is extremely
+        * unlikely). It is possible that a second hibernate recovery happens
+        * much faster than a first, causing the observed TSC here to be
+        * smaller; this would require additional padding adjustment, which is
+        * why we set last_host_tsc to the local tsc observed here.
+        *
+        * N.B. - this code below runs only on platforms with reliable TSC,
+        * as that is the only way backwards_tsc is set above.  Also note
+        * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+        * have the same delta_cyc adjustment applied if backwards_tsc
+        * is detected.  Note further, this adjustment is only done once,
+        * as we reset last_host_tsc on all VCPUs to stop this from being
+        * called multiple times (one for each physical CPU bringup).
+        *
+        * Platforms with unreliable TSCs don't have to deal with this, they
+        * will be compensated by the logic in vcpu_load, which sets the TSC to
+        * catchup mode.  This will catchup all VCPUs to real time, but cannot
+        * guarantee that they stay in perfect synchronization.
+        */
+       if (backwards_tsc) {
+               u64 delta_cyc = max_tsc - local_tsc;
+               list_for_each_entry(kvm, &vm_list, vm_list) {
+                       kvm_for_each_vcpu(i, vcpu, kvm) {
+                               vcpu->arch.tsc_offset_adjustment += delta_cyc;
+                               vcpu->arch.last_host_tsc = local_tsc;
+                       }
+                       /*
+                        * We have to disable TSC offset matching: if you were
+                        * booting a VM while issuing an S4 host suspend, you
+                        * may have some problems.  Solving this issue is
+                        * left as an exercise to the reader.
+                        */
+                       kvm->arch.last_tsc_nsec = 0;
+                       kvm->arch.last_tsc_write = 0;
+               }
+       }
+       return 0;
  }
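
A reduced user-space sketch (hypothetical TSC readings) of the suspend compensation above: the largest backwards gap observed across VCPUs is accumulated into each VCPU's pending offset adjustment, to be applied when that VCPU is next loaded:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        uint64_t local_tsc = 1000;                        /* TSC right after resume */
        uint64_t last_host_tsc[3] = { 5000, 7000, 6500 }; /* per-VCPU, pre-suspend */
        uint64_t max_tsc = 0, adjustment[3] = { 0, 0, 0 };

        for (int i = 0; i < 3; i++)
                if (last_host_tsc[i] > max_tsc)
                        max_tsc = last_host_tsc[i];

        if (max_tsc > local_tsc) {                        /* backwards TSC detected */
                uint64_t delta_cyc = max_tsc - local_tsc;
                for (int i = 0; i < 3; i++)
                        adjustment[i] += delta_cyc;       /* like tsc_offset_adjustment above */
        }
        printf("pending adjustment per VCPU: %" PRIu64 " cycles\n", adjustment[0]);
        return 0;
}
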
  
  void kvm_arch_hardware_disable(void *garbage)
@@@ -5958,6 -6113,11 +6114,11 @@@ void kvm_arch_check_processor_compat(vo
        kvm_x86_ops->check_processor_compatibility(rtn);
  }
  
+ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+ {
+       return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+ }
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
        struct page *page;
        }
        vcpu->arch.pio_data = page_address(page);
  
-       kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+       kvm_set_tsc_khz(vcpu, max_tsc_khz);
  
        r = kvm_mmu_create(vcpu);
        if (r < 0)
@@@ -6032,8 -6192,11 +6193,11 @@@ void kvm_arch_vcpu_uninit(struct kvm_vc
        free_page((unsigned long)vcpu->arch.pio_data);
  }
  
- int kvm_arch_init_vm(struct kvm *kvm)
+ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  {
+       if (type)
+               return -EINVAL;
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
  
@@@ -6093,6 -6256,65 +6257,65 @@@ void kvm_arch_destroy_vm(struct kvm *kv
                put_page(kvm->arch.ept_identity_pagetable);
  }
  
+ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+                          struct kvm_memory_slot *dont)
+ {
+       int i;
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+                       vfree(free->arch.lpage_info[i]);
+                       free->arch.lpage_info[i] = NULL;
+               }
+       }
+ }
+ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+ {
+       int i;
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               unsigned long ugfn;
+               int lpages;
+               int level = i + 2;
+               lpages = gfn_to_index(slot->base_gfn + npages - 1,
+                                     slot->base_gfn, level) + 1;
+               slot->arch.lpage_info[i] =
+                       vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+               if (!slot->arch.lpage_info[i])
+                       goto out_free;
+               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i][0].write_count = 1;
+               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+               ugfn = slot->userspace_addr >> PAGE_SHIFT;
+               /*
+                * If the gfn and userspace address are not aligned wrt each
+                * other, or if explicitly asked to, disable large page
+                * support for this slot
+                */
+               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+                   !kvm_largepages_enabled()) {
+                       unsigned long j;
+                       for (j = 0; j < lpages; ++j)
+                               slot->arch.lpage_info[i][j].write_count = 1;
+               }
+       }
+       return 0;
+ out_free:
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               vfree(slot->arch.lpage_info[i]);
+               slot->arch.lpage_info[i] = NULL;
+       }
+       return -ENOMEM;
+ }
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
                                struct kvm_memory_slot old,
diff --combined arch/x86/power/cpu.c
index 4889655ba784d49736f29fa42f855edbfabef3ca,0e76a2814127154f31bc4b373ed2bbbb95425f8d..47936830968c5240497725557be336f6fed112c9
@@@ -20,7 -20,6 +20,7 @@@
  #include <asm/xcr.h>
  #include <asm/suspend.h>
  #include <asm/debugreg.h>
 +#include <asm/fpu-internal.h> /* pcntxt_mask */
  
  #ifdef CONFIG_X86_32
  static struct saved_context saved_context;
@@@ -115,7 -114,7 +115,7 @@@ static void __save_processor_state(stru
  void save_processor_state(void)
  {
        __save_processor_state(&saved_context);
-       save_sched_clock_state();
+       x86_platform.save_sched_clock_state();
  }
  #ifdef CONFIG_X86_32
  EXPORT_SYMBOL(save_processor_state);
@@@ -231,8 -230,8 +231,8 @@@ static void __restore_processor_state(s
  /* Needed by apm.c */
  void restore_processor_state(void)
  {
+       x86_platform.restore_sched_clock_state();
        __restore_processor_state(&saved_context);
-       restore_sched_clock_state();
  }
  #ifdef CONFIG_X86_32
  EXPORT_SYMBOL(restore_processor_state);
diff --combined include/linux/kvm_host.h
index ca1b153585d3e6e2196f5d5c147d965c6bd84f14,40bb1c661a6e3ed1f3b808829d1dea1083648f3c..665a260c7e09948fa3b5e3516efca2890edef840
@@@ -13,7 -13,6 +13,7 @@@
  #include <linux/spinlock.h>
  #include <linux/signal.h>
  #include <linux/sched.h>
 +#include <linux/bug.h>
  #include <linux/mm.h>
  #include <linux/mmu_notifier.h>
  #include <linux/preempt.h>
@@@ -172,11 -171,6 +172,6 @@@ static inline int kvm_vcpu_exiting_gues
   */
  #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
  
- struct kvm_lpage_info {
-       unsigned long rmap_pde;
-       int write_count;
- };
  struct kvm_memory_slot {
        gfn_t base_gfn;
        unsigned long npages;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_head;
        unsigned long nr_dirty_pages;
-       struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+       struct kvm_arch_memory_slot arch;
        unsigned long userspace_addr;
        int user_alloc;
        int id;
@@@ -377,6 -371,9 +372,9 @@@ int kvm_set_memory_region(struct kvm *k
  int __kvm_set_memory_region(struct kvm *kvm,
                            struct kvm_userspace_memory_region *mem,
                            int user_alloc);
+ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+                          struct kvm_memory_slot *dont);
+ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
                                struct kvm_memory_slot old,
@@@ -386,6 -383,7 +384,7 @@@ void kvm_arch_commit_memory_region(stru
                                struct kvm_userspace_memory_region *mem,
                                struct kvm_memory_slot old,
                                int user_alloc);
+ bool kvm_largepages_enabled(void);
  void kvm_disable_largepages(void);
  void kvm_arch_flush_shadow(struct kvm *kvm);
  
@@@ -451,6 -449,7 +450,7 @@@ long kvm_arch_dev_ioctl(struct file *fi
                        unsigned int ioctl, unsigned long arg);
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg);
+ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
  
  int kvm_dev_ioctl_check_extension(long ext);
  
@@@ -521,7 -520,7 +521,7 @@@ static inline void kvm_arch_free_vm(str
  }
  #endif
  
- int kvm_arch_init_vm(struct kvm *kvm);
+ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
  void kvm_arch_destroy_vm(struct kvm *kvm);
  void kvm_free_all_assigned_devices(struct kvm *kvm);
  void kvm_arch_sync_events(struct kvm *kvm);
@@@ -547,6 -546,7 +547,7 @@@ struct kvm_assigned_dev_kernel 
        unsigned int entries_nr;
        int host_irq;
        bool host_irq_disabled;
+       bool pci_2_3;
        struct msix_entry *host_msix_entries;
        int guest_irq;
        struct msix_entry *guest_msix_entries;
        struct pci_dev *dev;
        struct kvm *kvm;
        spinlock_t intx_lock;
+       spinlock_t intx_mask_lock;
        char irq_name[32];
        struct pci_saved_state *pci_saved_state;
  };
@@@ -651,11 -652,43 +653,43 @@@ static inline void kvm_guest_exit(void
        current->flags &= ~PF_VCPU;
  }
  
+ /*
+  * search_memslots() and __gfn_to_memslot() are here because they are
+  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
+  * gfn_to_memslot() itself isn't here as an inline because that would
+  * bloat other code too much.
+  */
+ static inline struct kvm_memory_slot *
+ search_memslots(struct kvm_memslots *slots, gfn_t gfn)
+ {
+       struct kvm_memory_slot *memslot;
+       kvm_for_each_memslot(memslot, slots)
+               if (gfn >= memslot->base_gfn &&
+                     gfn < memslot->base_gfn + memslot->npages)
+                       return memslot;
+       return NULL;
+ }
+ static inline struct kvm_memory_slot *
+ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
+ {
+       return search_memslots(slots, gfn);
+ }
  static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
  {
        return gfn_to_memslot(kvm, gfn)->id;
  }
  
+ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+ {
+       /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+       return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+ }
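
A small worked example of gfn_to_index() as used by kvm_arch_create_memslot() to size the lpage_info arrays; the 9-bits-per-level shift is an x86-style assumption, not taken from this hunk:

#include <stdio.h>

#define HPAGE_GFN_SHIFT(level)  (((level) - 1) * 9)     /* assumed x86 value */

static unsigned long gfn_to_index(unsigned long gfn, unsigned long base_gfn, int level)
{
        return (gfn >> HPAGE_GFN_SHIFT(level)) -
               (base_gfn >> HPAGE_GFN_SHIFT(level));
}

int main(void)
{
        unsigned long base_gfn = 0x100, npages = 0x3000;
        int level = 2;  /* 2 MiB pages: 512 gfns each */
        /* same expression the patch uses to size lpage_info[level - 2] */
        unsigned long lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;

        printf("slot needs %lu lpage_info entries at level %d\n", lpages, level);
        return 0;
}
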
  static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
                                               gfn_t gfn)
  {
@@@ -702,12 -735,16 +736,16 @@@ static inline int mmu_notifier_retry(st
        if (unlikely(vcpu->kvm->mmu_notifier_count))
                return 1;
        /*
-        * Both reads happen under the mmu_lock and both values are
-        * modified under mmu_lock, so there's no need of smb_rmb()
-        * here in between, otherwise mmu_notifier_count should be
-        * read before mmu_notifier_seq, see
-        * mmu_notifier_invalidate_range_end write side.
+        * Ensure the read of mmu_notifier_count happens before the read
+        * of mmu_notifier_seq.  This interacts with the smp_wmb() in
+        * mmu_notifier_invalidate_range_end to make sure that the caller
+        * either sees the old (non-zero) value of mmu_notifier_count or
+        * the new (incremented) value of mmu_notifier_seq.
+        * PowerPC Book3s HV KVM calls this under a per-page lock
+        * rather than under kvm->mmu_lock, for scalability, so
+        * can't rely on kvm->mmu_lock to keep things ordered.
         */
+       smp_rmb();
        if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
                return 1;
        return 0;
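
A reduced user-space sketch of the ordering the comment above relies on, with C11 acquire/release standing in for the kernel's smp_wmb()/smp_rmb(): the writer bumps the sequence before dropping the count, so a reader that sees a zero count and an unchanged sequence knows no invalidation raced with it:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong notifier_seq;
static atomic_ulong notifier_count;

static void invalidate_range_start(void)                /* writer side */
{
        atomic_fetch_add_explicit(&notifier_count, 1, memory_order_relaxed);
}

static void invalidate_range_end(void)                  /* writer side */
{
        atomic_fetch_add_explicit(&notifier_seq, 1, memory_order_relaxed);
        /* release: publish the seq bump before count drops to zero */
        atomic_fetch_sub_explicit(&notifier_count, 1, memory_order_release);
}

static bool retry(unsigned long snapshot_seq)           /* reader side */
{
        if (atomic_load_explicit(&notifier_count, memory_order_acquire))
                return true;                            /* invalidation in flight */
        /* acquire above orders the count read before this seq read */
        return atomic_load_explicit(&notifier_seq, memory_order_relaxed) != snapshot_seq;
}

int main(void)
{
        unsigned long seq = atomic_load(&notifier_seq);

        invalidate_range_start();
        invalidate_range_end();
        printf("retry after a racing invalidation? %s\n", retry(seq) ? "yes" : "no");
        return 0;
}
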
@@@ -770,6 -807,13 +808,13 @@@ static inline bool kvm_vcpu_is_bsp(stru
  {
        return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
  }
+ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
+ #else
+ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
  #endif
  
  #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT