memcg: fix Bad page state after replace_page_cache
[~shefty/rdma-dev.git] / arch / x86 / kvm / pmu.c
1 /*
2  * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3  *
4  * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5  *
6  * Authors:
7  *   Avi Kivity   <avi@redhat.com>
8  *   Gleb Natapov <gleb@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2.  See
11  * the COPYING file in the top-level directory.
12  *
13  */
14
15 #include <linux/types.h>
16 #include <linux/kvm_host.h>
17 #include <linux/perf_event.h>
18 #include "x86.h"
19 #include "cpuid.h"
20 #include "lapic.h"
21
22 static struct kvm_arch_event_perf_mapping {
23         u8 eventsel;
24         u8 unit_mask;
25         unsigned event_type;
26         bool inexact;
27 } arch_events[] = {
28         /* Index must match CPUID 0x0A.EBX bit vector */
29         [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
30         [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
31         [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
32         [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
33         [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
34         [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
35         [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
36         [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
37 };
38
/*
 * Mapping between fixed pmc index and arch_events array.  Only read within
 * this file, so it is kept file-local and read-only.
 */
static const int fixed_pmc_events[] = {1, 0, 7};
41
42 static bool pmc_is_gp(struct kvm_pmc *pmc)
43 {
44         return pmc->type == KVM_PMC_GP;
45 }
46
47 static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
48 {
49         struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
50
51         return pmu->counter_bitmask[pmc->type];
52 }
53
54 static inline bool pmc_enabled(struct kvm_pmc *pmc)
55 {
56         struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
57         return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
58 }
59
60 static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
61                                          u32 base)
62 {
63         if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
64                 return &pmu->gp_counters[msr - base];
65         return NULL;
66 }
67
68 static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
69 {
70         int base = MSR_CORE_PERF_FIXED_CTR0;
71         if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
72                 return &pmu->fixed_counters[msr - base];
73         return NULL;
74 }
75
76 static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
77 {
78         return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
79 }
80
81 static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
82 {
83         if (idx < X86_PMC_IDX_FIXED)
84                 return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
85         else
86                 return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
87 }
88
89 void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
90 {
91         if (vcpu->arch.apic)
92                 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
93 }
94
95 static void trigger_pmi(struct irq_work *irq_work)
96 {
97         struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
98                         irq_work);
99         struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
100                         arch.pmu);
101
102         kvm_deliver_pmi(vcpu);
103 }
104
105 static void kvm_perf_overflow(struct perf_event *perf_event,
106                               struct perf_sample_data *data,
107                               struct pt_regs *regs)
108 {
109         struct kvm_pmc *pmc = perf_event->overflow_handler_context;
110         struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
111         __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
112 }
113
114 static void kvm_perf_overflow_intr(struct perf_event *perf_event,
115                 struct perf_sample_data *data, struct pt_regs *regs)
116 {
117         struct kvm_pmc *pmc = perf_event->overflow_handler_context;
118         struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
119         if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
120                 kvm_perf_overflow(perf_event, data, regs);
121                 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
122                 /*
123                  * Inject PMI. If vcpu was in a guest mode during NMI PMI
124                  * can be ejected on a guest mode re-entry. Otherwise we can't
125                  * be sure that vcpu wasn't executing hlt instruction at the
126                  * time of vmexit and is not going to re-enter guest mode until,
127                  * woken up. So we should wake it, but this is impossible from
128                  * NMI context. Do it from irq work instead.
129                  */
130                 if (!kvm_is_in_guest())
131                         irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
132                 else
133                         kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
134         }
135 }
136
137 static u64 read_pmc(struct kvm_pmc *pmc)
138 {
139         u64 counter, enabled, running;
140
141         counter = pmc->counter;
142
143         if (pmc->perf_event)
144                 counter += perf_event_read_value(pmc->perf_event,
145                                                  &enabled, &running);
146
147         /* FIXME: Scaling needed? */
148
149         return counter & pmc_bitmask(pmc);
150 }
151
152 static void stop_counter(struct kvm_pmc *pmc)
153 {
154         if (pmc->perf_event) {
155                 pmc->counter = read_pmc(pmc);
156                 perf_event_release_kernel(pmc->perf_event);
157                 pmc->perf_event = NULL;
158         }
159 }
160
161 static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
162                 unsigned config, bool exclude_user, bool exclude_kernel,
163                 bool intr)
164 {
165         struct perf_event *event;
166         struct perf_event_attr attr = {
167                 .type = type,
168                 .size = sizeof(attr),
169                 .pinned = true,
170                 .exclude_idle = true,
171                 .exclude_host = 1,
172                 .exclude_user = exclude_user,
173                 .exclude_kernel = exclude_kernel,
174                 .config = config,
175         };
176
177         attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178
179         event = perf_event_create_kernel_counter(&attr, -1, current,
180                                                  intr ? kvm_perf_overflow_intr :
181                                                  kvm_perf_overflow, pmc);
182         if (IS_ERR(event)) {
183                 printk_once("kvm: pmu event creation failed %ld\n",
184                                 PTR_ERR(event));
185                 return;
186         }
187
188         pmc->perf_event = event;
189         clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
190 }
191
192 static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
193                 u8 unit_mask)
194 {
195         int i;
196
197         for (i = 0; i < ARRAY_SIZE(arch_events); i++)
198                 if (arch_events[i].eventsel == event_select
199                                 && arch_events[i].unit_mask == unit_mask
200                                 && (pmu->available_event_types & (1 << i)))
201                         break;
202
203         if (i == ARRAY_SIZE(arch_events))
204                 return PERF_COUNT_HW_MAX;
205
206         return arch_events[i].event_type;
207 }
208
209 static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
210 {
211         unsigned config, type = PERF_TYPE_RAW;
212         u8 event_select, unit_mask;
213
214         if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
215                 printk_once("kvm pmu: pin control bit is ignored\n");
216
217         pmc->eventsel = eventsel;
218
219         stop_counter(pmc);
220
221         if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
222                 return;
223
224         event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
225         unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
226
227         if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228                                 ARCH_PERFMON_EVENTSEL_INV |
229                                 ARCH_PERFMON_EVENTSEL_CMASK))) {
230                 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231                                 unit_mask);
232                 if (config != PERF_COUNT_HW_MAX)
233                         type = PERF_TYPE_HARDWARE;
234         }
235
236         if (type == PERF_TYPE_RAW)
237                 config = eventsel & X86_RAW_EVENT_MASK;
238
239         reprogram_counter(pmc, type, config,
240                         !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241                         !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242                         eventsel & ARCH_PERFMON_EVENTSEL_INT);
243 }
244
245 static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
246 {
247         unsigned en = en_pmi & 0x3;
248         bool pmi = en_pmi & 0x8;
249
250         stop_counter(pmc);
251
252         if (!en || !pmc_enabled(pmc))
253                 return;
254
255         reprogram_counter(pmc, PERF_TYPE_HARDWARE,
256                         arch_events[fixed_pmc_events[idx]].event_type,
257                         !(en & 0x2), /* exclude user */
258                         !(en & 0x1), /* exclude kernel */
259                         pmi);
260 }
261
262 static inline u8 fixed_en_pmi(u64 ctrl, int idx)
263 {
264         return (ctrl >> (idx * 4)) & 0xf;
265 }
266
267 static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
268 {
269         int i;
270
271         for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
272                 u8 en_pmi = fixed_en_pmi(data, i);
273                 struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
274
275                 if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
276                         continue;
277
278                 reprogram_fixed_counter(pmc, en_pmi, i);
279         }
280
281         pmu->fixed_ctr_ctrl = data;
282 }
283
284 static void reprogram_idx(struct kvm_pmu *pmu, int idx)
285 {
286         struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
287
288         if (!pmc)
289                 return;
290
291         if (pmc_is_gp(pmc))
292                 reprogram_gp_counter(pmc, pmc->eventsel);
293         else {
294                 int fidx = idx - X86_PMC_IDX_FIXED;
295                 reprogram_fixed_counter(pmc,
296                                 fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
297         }
298 }
299
300 static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
301 {
302         int bit;
303         u64 diff = pmu->global_ctrl ^ data;
304
305         pmu->global_ctrl = data;
306
307         for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
308                 reprogram_idx(pmu, bit);
309 }
310
311 bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
312 {
313         struct kvm_pmu *pmu = &vcpu->arch.pmu;
314         int ret;
315
316         switch (msr) {
317         case MSR_CORE_PERF_FIXED_CTR_CTRL:
318         case MSR_CORE_PERF_GLOBAL_STATUS:
319         case MSR_CORE_PERF_GLOBAL_CTRL:
320         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
321                 ret = pmu->version > 1;
322                 break;
323         default:
324                 ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
325                         || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
326                         || get_fixed_pmc(pmu, msr);
327                 break;
328         }
329         return ret;
330 }
331
332 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
333 {
334         struct kvm_pmu *pmu = &vcpu->arch.pmu;
335         struct kvm_pmc *pmc;
336
337         switch (index) {
338         case MSR_CORE_PERF_FIXED_CTR_CTRL:
339                 *data = pmu->fixed_ctr_ctrl;
340                 return 0;
341         case MSR_CORE_PERF_GLOBAL_STATUS:
342                 *data = pmu->global_status;
343                 return 0;
344         case MSR_CORE_PERF_GLOBAL_CTRL:
345                 *data = pmu->global_ctrl;
346                 return 0;
347         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
348                 *data = pmu->global_ovf_ctrl;
349                 return 0;
350         default:
351                 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
352                                 (pmc = get_fixed_pmc(pmu, index))) {
353                         *data = read_pmc(pmc);
354                         return 0;
355                 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
356                         *data = pmc->eventsel;
357                         return 0;
358                 }
359         }
360         return 1;
361 }
362
363 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
364 {
365         struct kvm_pmu *pmu = &vcpu->arch.pmu;
366         struct kvm_pmc *pmc;
367
368         switch (index) {
369         case MSR_CORE_PERF_FIXED_CTR_CTRL:
370                 if (pmu->fixed_ctr_ctrl == data)
371                         return 0;
372                 if (!(data & 0xfffffffffffff444ull)) {
373                         reprogram_fixed_counters(pmu, data);
374                         return 0;
375                 }
376                 break;
377         case MSR_CORE_PERF_GLOBAL_STATUS:
378                 break; /* RO MSR */
379         case MSR_CORE_PERF_GLOBAL_CTRL:
380                 if (pmu->global_ctrl == data)
381                         return 0;
382                 if (!(data & pmu->global_ctrl_mask)) {
383                         global_ctrl_changed(pmu, data);
384                         return 0;
385                 }
386                 break;
387         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
388                 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
389                         pmu->global_status &= ~data;
390                         pmu->global_ovf_ctrl = data;
391                         return 0;
392                 }
393                 break;
394         default:
395                 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
396                                 (pmc = get_fixed_pmc(pmu, index))) {
397                         data = (s64)(s32)data;
398                         pmc->counter += data - read_pmc(pmc);
399                         return 0;
400                 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
401                         if (data == pmc->eventsel)
402                                 return 0;
403                         if (!(data & 0xffffffff00200000ull)) {
404                                 reprogram_gp_counter(pmc, data);
405                                 return 0;
406                         }
407                 }
408         }
409         return 1;
410 }
411
412 int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
413 {
414         struct kvm_pmu *pmu = &vcpu->arch.pmu;
415         bool fast_mode = pmc & (1u << 31);
416         bool fixed = pmc & (1u << 30);
417         struct kvm_pmc *counters;
418         u64 ctr;
419
420         pmc &= ~(3u << 30);
421         if (!fixed && pmc >= pmu->nr_arch_gp_counters)
422                 return 1;
423         if (fixed && pmc >= pmu->nr_arch_fixed_counters)
424                 return 1;
425         counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
426         ctr = read_pmc(&counters[pmc]);
427         if (fast_mode)
428                 ctr = (u32)ctr;
429         *data = ctr;
430
431         return 0;
432 }
433
434 void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
435 {
436         struct kvm_pmu *pmu = &vcpu->arch.pmu;
437         struct kvm_cpuid_entry2 *entry;
438         unsigned bitmap_len;
439
440         pmu->nr_arch_gp_counters = 0;
441         pmu->nr_arch_fixed_counters = 0;
442         pmu->counter_bitmask[KVM_PMC_GP] = 0;
443         pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
444         pmu->version = 0;
445
446         entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
447         if (!entry)
448                 return;
449
450         pmu->version = entry->eax & 0xff;
451         if (!pmu->version)
452                 return;
453
454         pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
455                         X86_PMC_MAX_GENERIC);
456         pmu->counter_bitmask[KVM_PMC_GP] =
457                 ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
458         bitmap_len = (entry->eax >> 24) & 0xff;
459         pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
460
461         if (pmu->version == 1) {
462                 pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
463                 return;
464         }
465
466         pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
467                         X86_PMC_MAX_FIXED);
468         pmu->counter_bitmask[KVM_PMC_FIXED] =
469                 ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
470         pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
471                         | (((1ull << pmu->nr_arch_fixed_counters) - 1)
472                                 << X86_PMC_IDX_FIXED));
473 }
474
475 void kvm_pmu_init(struct kvm_vcpu *vcpu)
476 {
477         int i;
478         struct kvm_pmu *pmu = &vcpu->arch.pmu;
479
480         memset(pmu, 0, sizeof(*pmu));
481         for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
482                 pmu->gp_counters[i].type = KVM_PMC_GP;
483                 pmu->gp_counters[i].vcpu = vcpu;
484                 pmu->gp_counters[i].idx = i;
485         }
486         for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
487                 pmu->fixed_counters[i].type = KVM_PMC_FIXED;
488                 pmu->fixed_counters[i].vcpu = vcpu;
489                 pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
490         }
491         init_irq_work(&pmu->irq_work, trigger_pmi);
492         kvm_pmu_cpuid_update(vcpu);
493 }
494
495 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
496 {
497         struct kvm_pmu *pmu = &vcpu->arch.pmu;
498         int i;
499
500         irq_work_sync(&pmu->irq_work);
501         for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
502                 struct kvm_pmc *pmc = &pmu->gp_counters[i];
503                 stop_counter(pmc);
504                 pmc->counter = pmc->eventsel = 0;
505         }
506
507         for (i = 0; i < X86_PMC_MAX_FIXED; i++)
508                 stop_counter(&pmu->fixed_counters[i]);
509
510         pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
511                 pmu->global_ovf_ctrl = 0;
512 }
513
/* Tear down the vcpu's PMU; this is the same work as a reset. */
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}
518
519 void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
520 {
521         struct kvm_pmu *pmu = &vcpu->arch.pmu;
522         u64 bitmask;
523         int bit;
524
525         bitmask = pmu->reprogram_pmi;
526
527         for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
528                 struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
529
530                 if (unlikely(!pmc || !pmc->perf_event)) {
531                         clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
532                         continue;
533                 }
534
535                 reprogram_idx(pmu, bit);
536         }
537 }