kernel/arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44 #include <linux/jiffies.h>
  45 #include <linux/swork.h>
  46
  47 #include <asm/processor.h>
  48 #include <asm/traps.h>
  49 #include <asm/tlbflush.h>
  50 #include <asm/mce.h>
  51 #include <asm/msr.h>
  52
  53 #include "mce-internal.h"
  54
  55 static DEFINE_MUTEX(mce_chrdev_read_mutex);
  56
  57 #define mce_log_get_idx_check(p) \
  58 ({ \
  59         RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
  60                          !lockdep_is_held(&mce_chrdev_read_mutex), \
  61                          "suspicious mce_log_get_idx_check() usage"); \
  62         smp_load_acquire(&(p)); \
  63 })
  64
  65 #define CREATE_TRACE_POINTS
  66 #include <trace/events/mce.h>
  67
  68 #define SPINUNIT                100     /* 100ns */
  69
  70 DEFINE_PER_CPU(unsigned, mce_exception_count);
  71
  72 struct mce_bank *mce_banks __read_mostly;
  73 struct mce_vendor_flags mce_flags __read_mostly;
  74
  75 struct mca_config mca_cfg __read_mostly = {
  76         .bootlog  = -1,
  77         /*
  78          * Tolerant levels:
  79          * 0: always panic on uncorrected errors, log corrected errors
  80          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  81          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  82          * 3: never panic or SIGBUS, log all errors (for testing only)
  83          */
  84         .tolerant = 1,
  85         .monarch_timeout = -1
  86 };
  87
  88 /* User mode helper program triggered by machine check event */
  89 static unsigned long            mce_need_notify;
  90 static char                     mce_helper[128];
  91 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  92
  93 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  94
  95 static DEFINE_PER_CPU(struct mce, mces_seen);
  96 static int                      cpu_missing;
  97
  98 /*
  99  * MCA banks polled by the period polling timer for corrected events.
 100  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 101  */
 102 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 103         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 104 };
 105
 106 /*
 107  * MCA banks controlled through firmware first for corrected errors.
 108  * This is a global list of banks for which we won't enable CMCI and we
 109  * won't poll. Firmware controls these banks and is responsible for
 110  * reporting corrected errors through GHES. Uncorrected/recoverable
 111  * errors are still notified through a machine check.
 112  */
 113 mce_banks_t mce_banks_ce_disabled;
 114
 115 static struct work_struct mce_work;
 116 static struct irq_work mce_irq_work;
 117
 118 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 119 static int mce_usable_address(struct mce *m);
 120
 121 /*
 122  * CPU/chipset specific EDAC code can register a notifier call here to print
 123  * MCE errors in a human-readable form.
 124  */
 125 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 126
 127 /* Do initial initialization of a struct mce */
 128 void mce_setup(struct mce *m)
 129 {
 130         memset(m, 0, sizeof(struct mce));
 131         m->cpu = m->extcpu = smp_processor_id();
 132         m->tsc = rdtsc();
 133         /* We hope get_seconds stays lockless */
 134         m->time = get_seconds();
 135         m->cpuvendor = boot_cpu_data.x86_vendor;
 136         m->cpuid = cpuid_eax(1);
 137         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 138         m->apicid = cpu_data(m->extcpu).initial_apicid;
 139         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 140 }
 141
 142 DEFINE_PER_CPU(struct mce, injectm);
 143 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 144
 145 /*
 146  * Lockless MCE logging infrastructure.
 147  * This avoids deadlocks on printk locks without having to break locks. Also
 148  * separate MCEs from kernel messages to avoid bogus bug reports.
 149  */
 150
 151 static struct mce_log mcelog = {
 152         .signature      = MCE_LOG_SIGNATURE,
 153         .len            = MCE_LOG_LEN,
 154         .recordlen      = sizeof(struct mce),
 155 };
 156
 157 void mce_log(struct mce *mce)
 158 {
 159         unsigned next, entry;
 160
 161         /* Emit the trace record: */
 162         trace_mce_record(mce);
 163
 164         if (!mce_gen_pool_add(mce))
 165                 irq_work_queue(&mce_irq_work);
 166
 167         mce->finished = 0;
 168         wmb();
 169         for (;;) {
 170                 entry = mce_log_get_idx_check(mcelog.next);
 171                 for (;;) {
 172
 173                         /*
 174                          * When the buffer fills up discard new entries.
 175                          * Assume that the earlier errors are the more
 176                          * interesting ones:
 177                          */
 178                         if (entry >= MCE_LOG_LEN) {
 179                                 set_bit(MCE_OVERFLOW,
 180                                         (unsigned long *)&mcelog.flags);
 181                                 return;
 182                         }
 183                         /* Old left over entry. Skip: */
 184                         if (mcelog.entry[entry].finished) {
 185                                 entry++;
 186                                 continue;
 187                         }
 188                         break;
 189                 }
 190                 smp_rmb();
 191                 next = entry + 1;
 192                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 193                         break;
 194         }
 195         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 196         wmb();
 197         mcelog.entry[entry].finished = 1;
 198         wmb();
 199
 200         mce->finished = 1;
 201         set_bit(0, &mce_need_notify);
 202 }
 203
 204 void mce_inject_log(struct mce *m)
 205 {
 206         mutex_lock(&mce_chrdev_read_mutex);
 207         mce_log(m);
 208         mutex_unlock(&mce_chrdev_read_mutex);
 209 }
 210 EXPORT_SYMBOL_GPL(mce_inject_log);
 211
 212 static struct notifier_block mce_srao_nb;
 213
 214 void mce_register_decode_chain(struct notifier_block *nb)
 215 {
 216         /* Ensure SRAO notifier has the highest priority in the decode chain. */
 217         if (nb != &mce_srao_nb && nb->priority == INT_MAX)
 218                 nb->priority -= 1;
 219
 220         atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 221 }
 222 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 223
 224 void mce_unregister_decode_chain(struct notifier_block *nb)
 225 {
 226         atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 227 }
 228 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 229
 230 static void print_mce(struct mce *m)
 231 {
 232         int ret = 0;
 233
 234         pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 235                m->extcpu, m->mcgstatus, m->bank, m->status);
 236
 237         if (m->ip) {
 238                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 239                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 240                                 m->cs, m->ip);
 241
 242                 if (m->cs == __KERNEL_CS)
 243                         print_symbol("{%s}", m->ip);
 244                 pr_cont("\n");
 245         }
 246
 247         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 248         if (m->addr)
 249                 pr_cont("ADDR %llx ", m->addr);
 250         if (m->misc)
 251                 pr_cont("MISC %llx ", m->misc);
 252
 253         pr_cont("\n");
 254         /*
 255          * Note this output is parsed by external tools and old fields
 256          * should not be changed.
 257          */
 258         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 259                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 260                 cpu_data(m->extcpu).microcode);
 261
 262         /*
 263          * Print out human-readable details about the MCE error,
 264          * (if the CPU has an implementation for that)
 265          */
 266         ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 267         if (ret == NOTIFY_STOP)
 268                 return;
 269
 270         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 271 }
 272
 273 #define PANIC_TIMEOUT 5 /* 5 seconds */
 274
 275 static atomic_t mce_panicked;
 276
 277 static int fake_panic;
 278 static atomic_t mce_fake_panicked;
 279
 280 /* Panic in progress. Enable interrupts and wait for final IPI */
 281 static void wait_for_panic(void)
 282 {
 283         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 284
 285         preempt_disable();
 286         local_irq_enable();
 287         while (timeout-- > 0)
 288                 udelay(1);
 289         if (panic_timeout == 0)
 290                 panic_timeout = mca_cfg.panic_timeout;
 291         panic("Panicing machine check CPU died");
 292 }
 293
 294 static void mce_panic(const char *msg, struct mce *final, char *exp)
 295 {
 296         int i, apei_err = 0;
 297
 298         if (!fake_panic) {
 299                 /*
 300                  * Make sure only one CPU runs in machine check panic
 301                  */
 302                 if (atomic_inc_return(&mce_panicked) > 1)
 303                         wait_for_panic();
 304                 barrier();
 305
 306                 bust_spinlocks(1);
 307                 console_verbose();
 308         } else {
 309                 /* Don't log too much for fake panic */
 310                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 311                         return;
 312         }
 313         /* First print corrected ones that are still unlogged */
 314         for (i = 0; i < MCE_LOG_LEN; i++) {
 315                 struct mce *m = &mcelog.entry[i];
 316                 if (!(m->status & MCI_STATUS_VAL))
 317                         continue;
 318                 if (!(m->status & MCI_STATUS_UC)) {
 319                         print_mce(m);
 320                         if (!apei_err)
 321                                 apei_err = apei_write_mce(m);
 322                 }
 323         }
 324         /* Now print uncorrected but with the final one last */
 325         for (i = 0; i < MCE_LOG_LEN; i++) {
 326                 struct mce *m = &mcelog.entry[i];
 327                 if (!(m->status & MCI_STATUS_VAL))
 328                         continue;
 329                 if (!(m->status & MCI_STATUS_UC))
 330                         continue;
 331                 if (!final || memcmp(m, final, sizeof(struct mce))) {
 332                         print_mce(m);
 333                         if (!apei_err)
 334                                 apei_err = apei_write_mce(m);
 335                 }
 336         }
 337         if (final) {
 338                 print_mce(final);
 339                 if (!apei_err)
 340                         apei_err = apei_write_mce(final);
 341         }
 342         if (cpu_missing)
 343                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 344         if (exp)
 345                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 346         if (!fake_panic) {
 347                 if (panic_timeout == 0)
 348                         panic_timeout = mca_cfg.panic_timeout;
 349                 panic(msg);
 350         } else
 351                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 352 }
 353
 354 /* Support code for software error injection */
 355
 356 static int msr_to_offset(u32 msr)
 357 {
 358         unsigned bank = __this_cpu_read(injectm.bank);
 359
 360         if (msr == mca_cfg.rip_msr)
 361                 return offsetof(struct mce, ip);
 362         if (msr == MSR_IA32_MCx_STATUS(bank))
 363                 return offsetof(struct mce, status);
 364         if (msr == MSR_IA32_MCx_ADDR(bank))
 365                 return offsetof(struct mce, addr);
 366         if (msr == MSR_IA32_MCx_MISC(bank))
 367                 return offsetof(struct mce, misc);
 368         if (msr == MSR_IA32_MCG_STATUS)
 369                 return offsetof(struct mce, mcgstatus);
 370         return -1;
 371 }
 372
 373 /* MSR access wrappers used for error injection */
 374 static u64 mce_rdmsrl(u32 msr)
 375 {
 376         u64 v;
 377
 378         if (__this_cpu_read(injectm.finished)) {
 379                 int offset = msr_to_offset(msr);
 380
 381                 if (offset < 0)
 382                         return 0;
 383                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 384         }
 385
 386         if (rdmsrl_safe(msr, &v)) {
 387                 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 388                 /*
 389                  * Return zero in case the access faulted. This should
 390                  * not happen normally but can happen if the CPU does
 391                  * something weird, or if the code is buggy.
 392                  */
 393                 v = 0;
 394         }
 395
 396         return v;
 397 }
 398
 399 static void mce_wrmsrl(u32 msr, u64 v)
 400 {
 401         if (__this_cpu_read(injectm.finished)) {
 402                 int offset = msr_to_offset(msr);
 403
 404                 if (offset >= 0)
 405                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 406                 return;
 407         }
 408         wrmsrl(msr, v);
 409 }
 410
 411 /*
 412  * Collect all global (w.r.t. this processor) status about this machine
 413  * check into our "mce" struct so that we can use it later to assess
 414  * the severity of the problem as we read per-bank specific details.
 415  */
 416 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 417 {
 418         mce_setup(m);
 419
 420         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 421         if (regs) {
 422                 /*
 423                  * Get the address of the instruction at the time of
 424                  * the machine check error.
 425                  */
 426                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 427                         m->ip = regs->ip;
 428                         m->cs = regs->cs;
 429
 430                         /*
 431                          * When in VM86 mode make the cs look like ring 3
 432                          * always. This is a lie, but it's better than passing
 433                          * the additional vm86 bit around everywhere.
 434                          */
 435                         if (v8086_mode(regs))
 436                                 m->cs |= 3;
 437                 }
 438                 /* Use accurate RIP reporting if available. */
 439                 if (mca_cfg.rip_msr)
 440                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 441         }
 442 }
 443
 444 int mce_available(struct cpuinfo_x86 *c)
 445 {
 446         if (mca_cfg.disabled)
 447                 return 0;
 448         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 449 }
 450
 451 static void mce_schedule_work(void)
 452 {
 453         if (!mce_gen_pool_empty() && keventd_up())
 454                 schedule_work(&mce_work);
 455 }
 456
 457 static void mce_irq_work_cb(struct irq_work *entry)
 458 {
 459         mce_notify_irq();
 460         mce_schedule_work();
 461 }
 462
 463 static void mce_report_event(struct pt_regs *regs)
 464 {
 465         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 466                 mce_notify_irq();
 467                 /*
 468                  * Triggering the work queue here is just an insurance
 469                  * policy in case the syscall exit notify handler
 470                  * doesn't run soon enough or ends up running on the
 471                  * wrong CPU (can happen when audit sleeps)
 472                  */
 473                 mce_schedule_work();
 474                 return;
 475         }
 476
 477         irq_work_queue(&mce_irq_work);
 478 }
 479
 480 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 481                                 void *data)
 482 {
 483         struct mce *mce = (struct mce *)data;
 484         unsigned long pfn;
 485
 486         if (!mce)
 487                 return NOTIFY_DONE;
 488
 489         if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
 490                 pfn = mce->addr >> PAGE_SHIFT;
 491                 memory_failure(pfn, MCE_VECTOR, 0);
 492         }
 493
 494         return NOTIFY_OK;
 495 }
 496 static struct notifier_block mce_srao_nb = {
 497         .notifier_call  = srao_decode_notifier,
 498         .priority = INT_MAX,
 499 };
 500
 501 /*
 502  * Read ADDR and MISC registers.
 503  */
 504 static void mce_read_aux(struct mce *m, int i)
 505 {
 506         if (m->status & MCI_STATUS_MISCV)
 507                 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 508         if (m->status & MCI_STATUS_ADDRV) {
 509                 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 510
 511                 /*
 512                  * Mask the reported address by the reported granularity.
 513                  */
 514                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 515                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 516                         m->addr >>= shift;
 517                         m->addr <<= shift;
 518                 }
 519         }
 520 }
 521
 522 static bool memory_error(struct mce *m)
 523 {
 524         struct cpuinfo_x86 *c = &boot_cpu_data;
 525
 526         if (c->x86_vendor == X86_VENDOR_AMD) {
 527                 /*
 528                  * coming soon
 529                  */
 530                 return false;
 531         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 532                 /*
 533                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 534                  *
 535                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 536                  * indicating a memory error. Bit 8 is used for indicating a
 537                  * cache hierarchy error. The combination of bit 2 and bit 3
 538                  * is used for indicating a `generic' cache hierarchy error
 539                  * But we can't just blindly check the above bits, because if
 540                  * bit 11 is set, then it is a bus/interconnect error - and
 541                  * either way the above bits just gives more detail on what
 542                  * bus/interconnect error happened. Note that bit 12 can be
 543                  * ignored, as it's the "filter" bit.
 544                  */
 545                 return (m->status & 0xef80) == BIT(7) ||
 546                        (m->status & 0xef00) == BIT(8) ||
 547                        (m->status & 0xeffc) == 0xc;
 548         }
 549
 550         return false;
 551 }
 552
 553 DEFINE_PER_CPU(unsigned, mce_poll_count);
 554
 555 /*
 556  * Poll for corrected events or events that happened before reset.
 557  * Those are just logged through /dev/mcelog.
 558  *
 559  * This is executed in standard interrupt context.
 560  *
 561  * Note: spec recommends to panic for fatal unsignalled
 562  * errors here. However this would be quite problematic --
 563  * we would need to reimplement the Monarch handling and
 564  * it would mess up the exclusion between exception handler
 565  * and poll hander -- * so we skip this for now.
 566  * These cases should not happen anyways, or only when the CPU
 567  * is already totally * confused. In this case it's likely it will
 568  * not fully execute the machine check handler either.
 569  */
 570 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 571 {
 572         bool error_logged = false;
 573         struct mce m;
 574         int severity;
 575         int i;
 576
 577         this_cpu_inc(mce_poll_count);
 578
 579         mce_gather_info(&m, NULL);
 580
 581         for (i = 0; i < mca_cfg.banks; i++) {
 582                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 583                         continue;
 584
 585                 m.misc = 0;
 586                 m.addr = 0;
 587                 m.bank = i;
 588                 m.tsc = 0;
 589
 590                 barrier();
 591                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 592                 if (!(m.status & MCI_STATUS_VAL))
 593                         continue;
 594
 595
 596                 /*
 597                  * Uncorrected or signalled events are handled by the exception
 598                  * handler when it is enabled, so don't process those here.
 599                  *
 600                  * TBD do the same check for MCI_STATUS_EN here?
 601                  */
 602                 if (!(flags & MCP_UC) &&
 603                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 604                         continue;
 605
 606                 mce_read_aux(&m, i);
 607
 608                 if (!(flags & MCP_TIMESTAMP))
 609                         m.tsc = 0;
 610
 611                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 612
 613                 /*
 614                  * In the cases where we don't have a valid address after all,
 615                  * do not add it into the ring buffer.
 616                  */
 617                 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 618                         if (m.status & MCI_STATUS_ADDRV) {
 619                                 m.severity = severity;
 620                                 m.usable_addr = mce_usable_address(&m);
 621
 622                                 if (!mce_gen_pool_add(&m))
 623                                         mce_schedule_work();
 624                         }
 625                 }
 626
 627                 /*
 628                  * Don't get the IP here because it's unlikely to
 629                  * have anything to do with the actual error location.
 630                  */
 631                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
 632                         error_logged = true;
 633                         mce_log(&m);
 634                 }
 635
 636                 /*
 637                  * Clear state for this bank.
 638                  */
 639                 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 640         }
 641
 642         /*
 643          * Don't clear MCG_STATUS here because it's only defined for
 644          * exceptions.
 645          */
 646
 647         sync_core();
 648
 649         return error_logged;
 650 }
 651 EXPORT_SYMBOL_GPL(machine_check_poll);
 652
 653 /*
 654  * Do a quick check if any of the events requires a panic.
 655  * This decides if we keep the events around or clear them.
 656  */
 657 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 658                           struct pt_regs *regs)
 659 {
 660         int i, ret = 0;
 661         char *tmp;
 662
 663         for (i = 0; i < mca_cfg.banks; i++) {
 664                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 665                 if (m->status & MCI_STATUS_VAL) {
 666                         __set_bit(i, validp);
 667                         if (quirk_no_way_out)
 668                                 quirk_no_way_out(i, m, regs);
 669                 }
 670
 671                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 672                         *msg = tmp;
 673                         ret = 1;
 674                 }
 675         }
 676         return ret;
 677 }
 678
 679 /*
 680  * Variable to establish order between CPUs while scanning.
 681  * Each CPU spins initially until executing is equal its number.
 682  */
 683 static atomic_t mce_executing;
 684
 685 /*
 686  * Defines order of CPUs on entry. First CPU becomes Monarch.
 687  */
 688 static atomic_t mce_callin;
 689
 690 /*
 691  * Check if a timeout waiting for other CPUs happened.
 692  */
 693 static int mce_timed_out(u64 *t, const char *msg)
 694 {
 695         /*
 696          * The others already did panic for some reason.
 697          * Bail out like in a timeout.
 698          * rmb() to tell the compiler that system_state
 699          * might have been modified by someone else.
 700          */
 701         rmb();
 702         if (atomic_read(&mce_panicked))
 703                 wait_for_panic();
 704         if (!mca_cfg.monarch_timeout)
 705                 goto out;
 706         if ((s64)*t < SPINUNIT) {
 707                 if (mca_cfg.tolerant <= 1)
 708                         mce_panic(msg, NULL, NULL);
 709                 cpu_missing = 1;
 710                 return 1;
 711         }
 712         *t -= SPINUNIT;
 713 out:
 714         touch_nmi_watchdog();
 715         return 0;
 716 }
 717
 718 /*
 719  * The Monarch's reign.  The Monarch is the CPU who entered
 720  * the machine check handler first. It waits for the others to
 721  * raise the exception too and then grades them. When any
 722  * error is fatal panic. Only then let the others continue.
 723  *
 724  * The other CPUs entering the MCE handler will be controlled by the
 725  * Monarch. They are called Subjects.
 726  *
 727  * This way we prevent any potential data corruption in a unrecoverable case
 728  * and also makes sure always all CPU's errors are examined.
 729  *
 730  * Also this detects the case of a machine check event coming from outer
 731  * space (not detected by any CPUs) In this case some external agent wants
 732  * us to shut down, so panic too.
 733  *
 734  * The other CPUs might still decide to panic if the handler happens
 735  * in a unrecoverable place, but in this case the system is in a semi-stable
 736  * state and won't corrupt anything by itself. It's ok to let the others
 737  * continue for a bit first.
 738  *
 739  * All the spin loops have timeouts; when a timeout happens a CPU
 740  * typically elects itself to be Monarch.
 741  */
 742 static void mce_reign(void)
 743 {
 744         int cpu;
 745         struct mce *m = NULL;
 746         int global_worst = 0;
 747         char *msg = NULL;
 748         char *nmsg = NULL;
 749
 750         /*
 751          * This CPU is the Monarch and the other CPUs have run
 752          * through their handlers.
 753          * Grade the severity of the errors of all the CPUs.
 754          */
 755         for_each_possible_cpu(cpu) {
 756                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 757                                             mca_cfg.tolerant,
 758                                             &nmsg, true);
 759                 if (severity > global_worst) {
 760                         msg = nmsg;
 761                         global_worst = severity;
 762                         m = &per_cpu(mces_seen, cpu);
 763                 }
 764         }
 765
 766         /*
 767          * Cannot recover? Panic here then.
 768          * This dumps all the mces in the log buffer and stops the
 769          * other CPUs.
 770          */
 771         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 772                 mce_panic("Fatal machine check", m, msg);
 773
 774         /*
 775          * For UC somewhere we let the CPU who detects it handle it.
 776          * Also must let continue the others, otherwise the handling
 777          * CPU could deadlock on a lock.
 778          */
 779
 780         /*
 781          * No machine check event found. Must be some external
 782          * source or one CPU is hung. Panic.
 783          */
 784         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 785                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 786
 787         /*
 788          * Now clear all the mces_seen so that they don't reappear on
 789          * the next mce.
 790          */
 791         for_each_possible_cpu(cpu)
 792                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 793 }
 794
 795 static atomic_t global_nwo;
 796
 797 /*
 798  * Start of Monarch synchronization. This waits until all CPUs have
 799  * entered the exception handler and then determines if any of them
 800  * saw a fatal event that requires panic. Then it executes them
 801  * in the entry order.
 802  * TBD double check parallel CPU hotunplug
 803  */
 804 static int mce_start(int *no_way_out)
 805 {
 806         int order;
 807         int cpus = num_online_cpus();
 808         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 809
 810         if (!timeout)
 811                 return -1;
 812
 813         atomic_add(*no_way_out, &global_nwo);
 814         /*
 815          * global_nwo should be updated before mce_callin
 816          */
 817         smp_wmb();
 818         order = atomic_inc_return(&mce_callin);
 819
 820         /*
 821          * Wait for everyone.
 822          */
 823         while (atomic_read(&mce_callin) != cpus) {
 824                 if (mce_timed_out(&timeout,
 825                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
 826                         atomic_set(&global_nwo, 0);
 827                         return -1;
 828                 }
 829                 ndelay(SPINUNIT);
 830         }
 831
 832         /*
 833          * mce_callin should be read before global_nwo
 834          */
 835         smp_rmb();
 836
 837         if (order == 1) {
 838                 /*
 839                  * Monarch: Starts executing now, the others wait.
 840                  */
 841                 atomic_set(&mce_executing, 1);
 842         } else {
 843                 /*
 844                  * Subject: Now start the scanning loop one by one in
 845                  * the original callin order.
 846                  * This way when there are any shared banks it will be
 847                  * only seen by one CPU before cleared, avoiding duplicates.
 848                  */
 849                 while (atomic_read(&mce_executing) < order) {
 850                         if (mce_timed_out(&timeout,
 851                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
 852                                 atomic_set(&global_nwo, 0);
 853                                 return -1;
 854                         }
 855                         ndelay(SPINUNIT);
 856                 }
 857         }
 858
 859         /*
 860          * Cache the global no_way_out state.
 861          */
 862         *no_way_out = atomic_read(&global_nwo);
 863
 864         return order;
 865 }
 866
 867 /*
 868  * Synchronize between CPUs after main scanning loop.
 869  * This invokes the bulk of the Monarch processing.
 870  */
 871 static int mce_end(int order)
 872 {
 873         int ret = -1;
 874         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 875
 876         if (!timeout)
 877                 goto reset;
 878         if (order < 0)
 879                 goto reset;
 880
 881         /*
 882          * Allow others to run.
 883          */
 884         atomic_inc(&mce_executing);
 885
 886         if (order == 1) {
 887                 /* CHECKME: Can this race with a parallel hotplug? */
 888                 int cpus = num_online_cpus();
 889
 890                 /*
 891                  * Monarch: Wait for everyone to go through their scanning
 892                  * loops.
 893                  */
 894                 while (atomic_read(&mce_executing) <= cpus) {
 895                         if (mce_timed_out(&timeout,
 896                                           "Timeout: Monarch CPU unable to finish machine check processing"))
 897                                 goto reset;
 898                         ndelay(SPINUNIT);
 899                 }
 900
 901                 mce_reign();
 902                 barrier();
 903                 ret = 0;
 904         } else {
 905                 /*
 906                  * Subject: Wait for Monarch to finish.
 907                  */
 908                 while (atomic_read(&mce_executing) != 0) {
 909                         if (mce_timed_out(&timeout,
 910                                           "Timeout: Monarch CPU did not finish machine check processing"))
 911                                 goto reset;
 912                         ndelay(SPINUNIT);
 913                 }
 914
 915                 /*
 916                  * Don't reset anything. That's done by the Monarch.
 917                  */
 918                 return 0;
 919         }
 920
 921         /*
 922          * Reset all global state.
 923          */
 924 reset:
 925         atomic_set(&global_nwo, 0);
 926         atomic_set(&mce_callin, 0);
 927         barrier();
 928
 929         /*
 930          * Let others run again.
 931          */
 932         atomic_set(&mce_executing, 0);
 933         return ret;
 934 }
 935
 936 /*
 937  * Check if the address reported by the CPU is in a format we can parse.
 938  * It would be possible to add code for most other cases, but all would
 939  * be somewhat complicated (e.g. segment offset would require an instruction
 940  * parser). So only support physical addresses up to page granuality for now.
 941  */
 942 static int mce_usable_address(struct mce *m)
 943 {
 944         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 945                 return 0;
 946         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 947                 return 0;
 948         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 949                 return 0;
 950         return 1;
 951 }
 952
 953 static void mce_clear_state(unsigned long *toclear)
 954 {
 955         int i;
 956
 957         for (i = 0; i < mca_cfg.banks; i++) {
 958                 if (test_bit(i, toclear))
 959                         mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 960         }
 961 }
 962
 963 /*
 964  * The actual machine check handler. This only handles real
 965  * exceptions when something got corrupted coming in through int 18.
 966  *
 967  * This is executed in NMI context not subject to normal locking rules. This
 968  * implies that most kernel services cannot be safely used. Don't even
 969  * think about putting a printk in there!
 970  *
 971  * On Intel systems this is entered on all CPUs in parallel through
 972  * MCE broadcast. However some CPUs might be broken beyond repair,
 973  * so be always careful when synchronizing with others.
 974  */
 975 void do_machine_check(struct pt_regs *regs, long error_code)
 976 {
 977         struct mca_config *cfg = &mca_cfg;
 978         struct mce m, *final;
 979         int i;
 980         int worst = 0;
 981         int severity;
 982         /*
 983          * Establish sequential order between the CPUs entering the machine
 984          * check handler.
 985          */
 986         int order;
 987         /*
 988          * If no_way_out gets set, there is no safe way to recover from this
 989          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
 990          */
 991         int no_way_out = 0;
 992         /*
 993          * If kill_it gets set, there might be a way to recover from this
 994          * error.
 995          */
 996         int kill_it = 0;
 997         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 998         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 999         char *msg = "Unknown";
1000         u64 recover_paddr = ~0ull;
1001         int flags = MF_ACTION_REQUIRED;
1002         int lmce = 0;
1003
1004         /* If this CPU is offline, just bail out. */
1005         if (cpu_is_offline(smp_processor_id())) {
1006                 u64 mcgstatus;
1007
1008                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1009                 if (mcgstatus & MCG_STATUS_RIPV) {
1010                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1011                         return;
1012                 }
1013         }
1014
1015         ist_enter(regs);
1016
1017         this_cpu_inc(mce_exception_count);
1018
1019         if (!cfg->banks)
1020                 goto out;
1021
1022         mce_gather_info(&m, regs);
1023
1024         final = this_cpu_ptr(&mces_seen);
1025         *final = m;
1026
1027         memset(valid_banks, 0, sizeof(valid_banks));
1028         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1029
1030         barrier();
1031
1032         /*
1033          * When no restart IP might need to kill or panic.
1034          * Assume the worst for now, but if we find the
1035          * severity is MCE_AR_SEVERITY we have other options.
1036          */
1037         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1038                 kill_it = 1;
1039
1040         /*
1041          * Check if this MCE is signaled to only this logical processor
1042          */
1043         if (m.mcgstatus & MCG_STATUS_LMCES)
1044                 lmce = 1;
1045         else {
1046                 /*
1047                  * Go through all the banks in exclusion of the other CPUs.
1048                  * This way we don't report duplicated events on shared banks
1049                  * because the first one to see it will clear it.
1050                  * If this is a Local MCE, then no need to perform rendezvous.
1051                  */
1052                 order = mce_start(&no_way_out);
1053         }
1054
1055         for (i = 0; i < cfg->banks; i++) {
1056                 __clear_bit(i, toclear);
1057                 if (!test_bit(i, valid_banks))
1058                         continue;
1059                 if (!mce_banks[i].ctl)
1060                         continue;
1061
1062                 m.misc = 0;
1063                 m.addr = 0;
1064                 m.bank = i;
1065
1066                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1067                 if ((m.status & MCI_STATUS_VAL) == 0)
1068                         continue;
1069
1070                 /*
1071                  * Non uncorrected or non signaled errors are handled by
1072                  * machine_check_poll. Leave them alone, unless this panics.
1073                  */
1074                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1075                         !no_way_out)
1076                         continue;
1077
1078                 /*
1079                  * Set taint even when machine check was not enabled.
1080                  */
1081                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1082
1083                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1084
1085                 /*
1086                  * When machine check was for corrected/deferred handler don't
1087                  * touch, unless we're panicing.
1088                  */
1089                 if ((severity == MCE_KEEP_SEVERITY ||
1090                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1091                         continue;
1092                 __set_bit(i, toclear);
1093                 if (severity == MCE_NO_SEVERITY) {
1094                         /*
1095                          * Machine check event was not enabled. Clear, but
1096                          * ignore.
1097                          */
1098                         continue;
1099                 }
1100
1101                 mce_read_aux(&m, i);
1102
1103                 /* assuming valid severity level != 0 */
1104                 m.severity = severity;
1105                 m.usable_addr = mce_usable_address(&m);
1106
1107                 mce_log(&m);
1108
1109                 if (severity > worst) {
1110                         *final = m;
1111                         worst = severity;
1112                 }
1113         }
1114
1115         /* mce_clear_state will clear *final, save locally for use later */
1116         m = *final;
1117
1118         if (!no_way_out)
1119                 mce_clear_state(toclear);
1120
1121         /*
1122          * Do most of the synchronization with other CPUs.
1123          * When there's any problem use only local no_way_out state.
1124          */
1125         if (!lmce) {
1126                 if (mce_end(order) < 0)
1127                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1128         } else {
1129                 /*
1130                  * Local MCE skipped calling mce_reign()
1131                  * If we found a fatal error, we need to panic here.
1132                  */
1133                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1134                         mce_panic("Machine check from unknown source",
1135                                 NULL, NULL);
1136         }
1137
1138         /*
1139          * At insane "tolerant" levels we take no action. Otherwise
1140          * we only die if we have no other choice. For less serious
1141          * issues we try to recover, or limit damage to the current
1142          * process.
1143          */
1144         if (cfg->tolerant < 3) {
1145                 if (no_way_out)
1146                         mce_panic("Fatal machine check on current CPU", &m, msg);
1147                 if (worst == MCE_AR_SEVERITY) {
1148                         recover_paddr = m.addr;
1149                         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1150                                 flags |= MF_MUST_KILL;
1151                 } else if (kill_it) {
1152                         force_sig(SIGBUS, current);
1153                 }
1154         }
1155
1156         if (worst > 0)
1157                 mce_report_event(regs);
1158         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1159 out:
1160         sync_core();
1161
1162         if (recover_paddr == ~0ull)
1163                 goto done;
1164
1165         pr_err("Uncorrected hardware memory error in user-access at %llx",
1166                  recover_paddr);
1167         /*
1168          * We must call memory_failure() here even if the current process is
1169          * doomed. We still need to mark the page as poisoned and alert any
1170          * other users of the page.
1171          */
1172         ist_begin_non_atomic(regs);
1173         local_irq_enable();
1174         if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1175                 pr_err("Memory error not recovered");
1176                 force_sig(SIGBUS, current);
1177         }
1178         local_irq_disable();
1179         ist_end_non_atomic();
1180 done:
1181         ist_exit(regs);
1182 }
1183 EXPORT_SYMBOL_GPL(do_machine_check);
1184
1185 #ifndef CONFIG_MEMORY_FAILURE
1186 int memory_failure(unsigned long pfn, int vector, int flags)
1187 {
1188         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1189         BUG_ON(flags & MF_ACTION_REQUIRED);
1190         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1191                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1192                pfn);
1193
1194         return 0;
1195 }
1196 #endif
1197
1198 /*
1199  * Action optional processing happens here (picking up
1200  * from the list of faulting pages that do_machine_check()
1201  * placed into the genpool).
1202  */
1203 static void mce_process_work(struct work_struct *dummy)
1204 {
1205         mce_gen_pool_process();
1206 }
1207
1208 #ifdef CONFIG_X86_MCE_INTEL
1209 /***
1210  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1211  * @cpu: The CPU on which the event occurred.
1212  * @status: Event status information
1213  *
1214  * This function should be called by the thermal interrupt after the
1215  * event has been processed and the decision was made to log the event
1216  * further.
1217  *
1218  * The status parameter will be saved to the 'status' field of 'struct mce'
1219  * and historically has been the register value of the
1220  * MSR_IA32_THERMAL_STATUS (Intel) msr.
1221  */
1222 void mce_log_therm_throt_event(__u64 status)
1223 {
1224         struct mce m;
1225
1226         mce_setup(&m);
1227         m.bank = MCE_THERMAL_BANK;
1228         m.status = status;
1229         mce_log(&m);
1230 }
1231 #endif /* CONFIG_X86_MCE_INTEL */
1232
1233 /*
1234  * Periodic polling timer for "silent" machine check errors.  If the
1235  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1236  * errors, poll 2x slower (up to check_interval seconds).
1237  */
1238 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1239
1240 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1241 static DEFINE_PER_CPU(struct hrtimer, mce_timer);
1242
1243 static unsigned long mce_adjust_timer_default(unsigned long interval)
1244 {
1245         return interval;
1246 }
1247
1248 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1249
1250 static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
1251 {
1252         if (!interval)
1253                 return HRTIMER_NORESTART;
1254         hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
1255         return HRTIMER_RESTART;
1256 }
1257
1258 static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
1259 {
1260         unsigned long iv;
1261
1262         iv = __this_cpu_read(mce_next_interval);
1263
1264         if (mce_available(this_cpu_ptr(&cpu_info))) {
1265                 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1266
1267                 if (mce_intel_cmci_poll()) {
1268                         iv = mce_adjust_timer(iv);
1269                         goto done;
1270                 }
1271         }
1272
1273         /*
1274          * Alert userspace if needed. If we logged an MCE, reduce the polling
1275          * interval, otherwise increase the polling interval.
1276          */
1277         if (mce_notify_irq())
1278                 iv = max(iv / 2, (unsigned long) HZ/100);
1279         else
1280                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1281
1282 done:
1283         __this_cpu_write(mce_next_interval, iv);
1284         return __restart_timer(timer, iv);
1285 }
1286
1287 /*
1288  * Ensure that the timer is firing in @interval from now.
1289  */
1290 void mce_timer_kick(unsigned long interval)
1291 {
1292         struct hrtimer *t = this_cpu_ptr(&mce_timer);
1293         unsigned long iv = __this_cpu_read(mce_next_interval);
1294
1295         __restart_timer(t, interval);
1296
1297         if (interval < iv)
1298                 __this_cpu_write(mce_next_interval, interval);
1299 }
1300
1301 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1302 static void mce_timer_delete_all(void)
1303 {
1304         int cpu;
1305
1306         for_each_online_cpu(cpu)
1307                 hrtimer_cancel(&per_cpu(mce_timer, cpu));
1308 }
1309
1310 static void mce_do_trigger(struct work_struct *work)
1311 {
1312         call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1313 }
1314
1315 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1316
1317 static void __mce_notify_work(struct swork_event *event)
1318 {
1319         /* Not more than two messages every minute */
1320         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1321
1322         /* wake processes polling /dev/mcelog */
1323         wake_up_interruptible(&mce_chrdev_wait);
1324
1325         /*
1326          * There is no risk of missing notifications because
1327          * work_pending is always cleared before the function is
1328          * executed.
1329          */
1330         if (mce_helper[0] && !work_pending(&mce_trigger_work))
1331                 schedule_work(&mce_trigger_work);
1332
1333         if (__ratelimit(&ratelimit))
1334                 pr_info(HW_ERR "Machine check events logged\n");
1335 }
1336
1337 #ifdef CONFIG_PREEMPT_RT_FULL
1338 static bool notify_work_ready __read_mostly;
1339 static struct swork_event notify_work;
1340
1341 static int mce_notify_work_init(void)
1342 {
1343         int err;
1344
1345         err = swork_get();
1346         if (err)
1347                 return err;
1348
1349         INIT_SWORK(&notify_work, __mce_notify_work);
1350         notify_work_ready = true;
1351         return 0;
1352 }
1353
1354 static void mce_notify_work(void)
1355 {
1356         if (notify_work_ready)
1357                 swork_queue(&notify_work);
1358 }
1359 #else
1360 static void mce_notify_work(void)
1361 {
1362         __mce_notify_work(NULL);
1363 }
1364 static inline int mce_notify_work_init(void) { return 0; }
1365 #endif
1366
1367 /*
1368  * Notify the user(s) about new machine check events.
1369  * Can be called from interrupt context, but not from machine check/NMI
1370  * context.
1371  */
1372 int mce_notify_irq(void)
1373 {
1374         if (test_and_clear_bit(0, &mce_need_notify)) {
1375                 mce_notify_work();
1376                 return 1;
1377         }
1378         return 0;
1379 }
1380 EXPORT_SYMBOL_GPL(mce_notify_irq);
1381
1382 static int __mcheck_cpu_mce_banks_init(void)
1383 {
1384         int i;
1385         u8 num_banks = mca_cfg.banks;
1386
1387         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1388         if (!mce_banks)
1389                 return -ENOMEM;
1390
1391         for (i = 0; i < num_banks; i++) {
1392                 struct mce_bank *b = &mce_banks[i];
1393
1394                 b->ctl = -1ULL;
1395                 b->init = 1;
1396         }
1397         return 0;
1398 }
1399
1400 /*
1401  * Initialize Machine Checks for a CPU.
1402  */
1403 static int __mcheck_cpu_cap_init(void)
1404 {
1405         unsigned b;
1406         u64 cap;
1407
1408         rdmsrl(MSR_IA32_MCG_CAP, cap);
1409
1410         b = cap & MCG_BANKCNT_MASK;
1411         if (!mca_cfg.banks)
1412                 pr_info("CPU supports %d MCE banks\n", b);
1413
1414         if (b > MAX_NR_BANKS) {
1415                 pr_warn("Using only %u machine check banks out of %u\n",
1416                         MAX_NR_BANKS, b);
1417                 b = MAX_NR_BANKS;
1418         }
1419
1420         /* Don't support asymmetric configurations today */
1421         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1422         mca_cfg.banks = b;
1423
1424         if (!mce_banks) {
1425                 int err = __mcheck_cpu_mce_banks_init();
1426
1427                 if (err)
1428                         return err;
1429         }
1430
1431         /* Use accurate RIP reporting if available. */
1432         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1433                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1434
1435         if (cap & MCG_SER_P)
1436                 mca_cfg.ser = true;
1437
1438         return 0;
1439 }
1440
1441 static void __mcheck_cpu_init_generic(void)
1442 {
1443         enum mcp_flags m_fl = 0;
1444         mce_banks_t all_banks;
1445         u64 cap;
1446         int i;
1447
1448         if (!mca_cfg.bootlog)
1449                 m_fl = MCP_DONTLOG;
1450
1451         /*
1452          * Log the machine checks left over from the previous reset.
1453          */
1454         bitmap_fill(all_banks, MAX_NR_BANKS);
1455         machine_check_poll(MCP_UC | m_fl, &all_banks);
1456
1457         cr4_set_bits(X86_CR4_MCE);
1458
1459         rdmsrl(MSR_IA32_MCG_CAP, cap);
1460         if (cap & MCG_CTL_P)
1461                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1462
1463         for (i = 0; i < mca_cfg.banks; i++) {
1464                 struct mce_bank *b = &mce_banks[i];
1465
1466                 if (!b->init)
1467                         continue;
1468                 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1469                 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1470         }
1471 }
1472
1473 /*
1474  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1475  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1476  * Vol 3B Table 15-20). But this confuses both the code that determines
1477  * whether the machine check occurred in kernel or user mode, and also
1478  * the severity assessment code. Pretend that EIPV was set, and take the
1479  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1480  */
1481 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1482 {
1483         if (bank != 0)
1484                 return;
1485         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1486                 return;
1487         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1488                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1489                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1490                           MCACOD)) !=
1491                          (MCI_STATUS_UC|MCI_STATUS_EN|
1492                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1493                           MCI_STATUS_AR|MCACOD_INSTR))
1494                 return;
1495
1496         m->mcgstatus |= MCG_STATUS_EIPV;
1497         m->ip = regs->ip;
1498         m->cs = regs->cs;
1499 }
1500
1501 /* Add per CPU specific workarounds here */
1502 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1503 {
1504         struct mca_config *cfg = &mca_cfg;
1505
1506         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1507                 pr_info("unknown CPU type - not enabling MCE support\n");
1508                 return -EOPNOTSUPP;
1509         }
1510
1511         /* This should be disabled by the BIOS, but isn't always */
1512         if (c->x86_vendor == X86_VENDOR_AMD) {
1513                 if (c->x86 == 15 && cfg->banks > 4) {
1514                         /*
1515                          * disable GART TBL walk error reporting, which
1516                          * trips off incorrectly with the IOMMU & 3ware
1517                          * & Cerberus:
1518                          */
1519                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1520                 }
1521                 if (c->x86 <= 17 && cfg->bootlog < 0) {
1522                         /*
1523                          * Lots of broken BIOS around that don't clear them
1524                          * by default and leave crap in there. Don't log:
1525                          */
1526                         cfg->bootlog = 0;
1527                 }
1528                 /*
1529                  * Various K7s with broken bank 0 around. Always disable
1530                  * by default.
1531                  */
1532                 if (c->x86 == 6 && cfg->banks > 0)
1533                         mce_banks[0].ctl = 0;
1534
1535                 /*
1536                  * overflow_recov is supported for F15h Models 00h-0fh
1537                  * even though we don't have a CPUID bit for it.
1538                  */
1539                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1540                         mce_flags.overflow_recov = 1;
1541
1542                 /*
1543                  * Turn off MC4_MISC thresholding banks on those models since
1544                  * they're not supported there.
1545                  */
1546                 if (c->x86 == 0x15 &&
1547                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1548                         int i;
1549                         u64 hwcr;
1550                         bool need_toggle;
1551                         u32 msrs[] = {
1552                                 0x00000413, /* MC4_MISC0 */
1553                                 0xc0000408, /* MC4_MISC1 */
1554                         };
1555
1556                         rdmsrl(MSR_K7_HWCR, hwcr);
1557
1558                         /* McStatusWrEn has to be set */
1559                         need_toggle = !(hwcr & BIT(18));
1560
1561                         if (need_toggle)
1562                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1563
1564                         /* Clear CntP bit safely */
1565                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1566                                 msr_clear_bit(msrs[i], 62);
1567
1568                         /* restore old settings */
1569                         if (need_toggle)
1570                                 wrmsrl(MSR_K7_HWCR, hwcr);
1571                 }
1572         }
1573
1574         if (c->x86_vendor == X86_VENDOR_INTEL) {
1575                 /*
1576                  * SDM documents that on family 6 bank 0 should not be written
1577                  * because it aliases to another special BIOS controlled
1578                  * register.
1579                  * But it's not aliased anymore on model 0x1a+
1580                  * Don't ignore bank 0 completely because there could be a
1581                  * valid event later, merely don't write CTL0.
1582                  */
1583
1584                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1585                         mce_banks[0].init = 0;
1586
1587                 /*
1588                  * All newer Intel systems support MCE broadcasting. Enable
1589                  * synchronization with a one second timeout.
1590                  */
1591                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1592                         cfg->monarch_timeout < 0)
1593                         cfg->monarch_timeout = USEC_PER_SEC;
1594
1595                 /*
1596                  * There are also broken BIOSes on some Pentium M and
1597                  * earlier systems:
1598                  */
1599                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1600                         cfg->bootlog = 0;
1601
1602                 if (c->x86 == 6 && c->x86_model == 45)
1603                         quirk_no_way_out = quirk_sandybridge_ifu;
1604         }
1605         if (cfg->monarch_timeout < 0)
1606                 cfg->monarch_timeout = 0;
1607         if (cfg->bootlog != 0)
1608                 cfg->panic_timeout = 30;
1609
1610         return 0;
1611 }
1612
1613 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1614 {
1615         if (c->x86 != 5)
1616                 return 0;
1617
1618         switch (c->x86_vendor) {
1619         case X86_VENDOR_INTEL:
1620                 intel_p5_mcheck_init(c);
1621                 return 1;
1622                 break;
1623         case X86_VENDOR_CENTAUR:
1624                 winchip_mcheck_init(c);
1625                 return 1;
1626                 break;
1627         default:
1628                 return 0;
1629         }
1630
1631         return 0;
1632 }
1633
1634 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1635 {
1636         switch (c->x86_vendor) {
1637         case X86_VENDOR_INTEL:
1638                 mce_intel_feature_init(c);
1639                 mce_adjust_timer = cmci_intel_adjust_timer;
1640                 break;
1641
1642         case X86_VENDOR_AMD: {
1643                 u32 ebx = cpuid_ebx(0x80000007);
1644
1645                 mce_amd_feature_init(c);
1646                 mce_flags.overflow_recov = !!(ebx & BIT(0));
1647                 mce_flags.succor         = !!(ebx & BIT(1));
1648                 mce_flags.smca           = !!(ebx & BIT(3));
1649
1650                 break;
1651                 }
1652
1653         default:
1654                 break;
1655         }
1656 }
1657
1658 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1659 {
1660         switch (c->x86_vendor) {
1661         case X86_VENDOR_INTEL:
1662                 mce_intel_feature_clear(c);
1663                 break;
1664         default:
1665                 break;
1666         }
1667 }
1668
1669 static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
1670 {
1671         unsigned long iv = check_interval * HZ;
1672
1673         if (mca_cfg.ignore_ce || !iv)
1674                 return;
1675
1676         per_cpu(mce_next_interval, cpu) = iv;
1677
1678         hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
1679                         0, HRTIMER_MODE_REL_PINNED);
1680 }
1681
1682 static void __mcheck_cpu_init_timer(void)
1683 {
1684         struct hrtimer *t = this_cpu_ptr(&mce_timer);
1685         unsigned int cpu = smp_processor_id();
1686
1687         hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1688         t->function = mce_timer_fn;
1689         mce_start_timer(cpu, t);
1690 }
1691
1692 /* Handle unconfigured int18 (should never happen) */
1693 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1694 {
1695         pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1696                smp_processor_id());
1697 }
1698
1699 /* Call the installed machine check handler for this CPU setup. */
1700 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1701                                                 unexpected_machine_check;
1702
1703 /*
1704  * Called for each booted CPU to set up machine checks.
1705  * Must be called with preempt off:
1706  */
1707 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1708 {
1709         if (mca_cfg.disabled)
1710                 return;
1711
1712         if (__mcheck_cpu_ancient_init(c))
1713                 return;
1714
1715         if (!mce_available(c))
1716                 return;
1717
1718         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1719                 mca_cfg.disabled = true;
1720                 return;
1721         }
1722
1723         if (mce_gen_pool_init()) {
1724                 mca_cfg.disabled = true;
1725                 pr_emerg("Couldn't allocate MCE records pool!\n");
1726                 return;
1727         }
1728
1729         machine_check_vector = do_machine_check;
1730
1731         __mcheck_cpu_init_generic();
1732         __mcheck_cpu_init_vendor(c);
1733         __mcheck_cpu_init_timer();
1734 }
1735
1736 /*
1737  * Called for each booted CPU to clear some machine checks opt-ins
1738  */
1739 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1740 {
1741         if (mca_cfg.disabled)
1742                 return;
1743
1744         if (!mce_available(c))
1745                 return;
1746
1747         /*
1748          * Possibly to clear general settings generic to x86
1749          * __mcheck_cpu_clear_generic(c);
1750          */
1751         __mcheck_cpu_clear_vendor(c);
1752
1753 }
1754
1755 /*
1756  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1757  */
1758
1759 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1760 static int mce_chrdev_open_count;       /* #times opened */
1761 static int mce_chrdev_open_exclu;       /* already open exclusive? */
1762
1763 static int mce_chrdev_open(struct inode *inode, struct file *file)
1764 {
1765         spin_lock(&mce_chrdev_state_lock);
1766
1767         if (mce_chrdev_open_exclu ||
1768             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1769                 spin_unlock(&mce_chrdev_state_lock);
1770
1771                 return -EBUSY;
1772         }
1773
1774         if (file->f_flags & O_EXCL)
1775                 mce_chrdev_open_exclu = 1;
1776         mce_chrdev_open_count++;
1777
1778         spin_unlock(&mce_chrdev_state_lock);
1779
1780         return nonseekable_open(inode, file);
1781 }
1782
1783 static int mce_chrdev_release(struct inode *inode, struct file *file)
1784 {
1785         spin_lock(&mce_chrdev_state_lock);
1786
1787         mce_chrdev_open_count--;
1788         mce_chrdev_open_exclu = 0;
1789
1790         spin_unlock(&mce_chrdev_state_lock);
1791
1792         return 0;
1793 }
1794
1795 static void collect_tscs(void *data)
1796 {
1797         unsigned long *cpu_tsc = (unsigned long *)data;
1798
1799         cpu_tsc[smp_processor_id()] = rdtsc();
1800 }
1801
1802 static int mce_apei_read_done;
1803
1804 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1805 static int __mce_read_apei(char __user **ubuf, size_t usize)
1806 {
1807         int rc;
1808         u64 record_id;
1809         struct mce m;
1810
1811         if (usize < sizeof(struct mce))
1812                 return -EINVAL;
1813
1814         rc = apei_read_mce(&m, &record_id);
1815         /* Error or no more MCE record */
1816         if (rc <= 0) {
1817                 mce_apei_read_done = 1;
1818                 /*
1819                  * When ERST is disabled, mce_chrdev_read() should return
1820                  * "no record" instead of "no device."
1821                  */
1822                 if (rc == -ENODEV)
1823                         return 0;
1824                 return rc;
1825         }
1826         rc = -EFAULT;
1827         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1828                 return rc;
1829         /*
1830          * In fact, we should have cleared the record after that has
1831          * been flushed to the disk or sent to network in
1832          * /sbin/mcelog, but we have no interface to support that now,
1833          * so just clear it to avoid duplication.
1834          */
1835         rc = apei_clear_mce(record_id);
1836         if (rc) {
1837                 mce_apei_read_done = 1;
1838                 return rc;
1839         }
1840         *ubuf += sizeof(struct mce);
1841
1842         return 0;
1843 }
1844
1845 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1846                                 size_t usize, loff_t *off)
1847 {
1848         char __user *buf = ubuf;
1849         unsigned long *cpu_tsc;
1850         unsigned prev, next;
1851         int i, err;
1852
1853         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1854         if (!cpu_tsc)
1855                 return -ENOMEM;
1856
1857         mutex_lock(&mce_chrdev_read_mutex);
1858
1859         if (!mce_apei_read_done) {
1860                 err = __mce_read_apei(&buf, usize);
1861                 if (err || buf != ubuf)
1862                         goto out;
1863         }
1864
1865         next = mce_log_get_idx_check(mcelog.next);
1866
1867         /* Only supports full reads right now */
1868         err = -EINVAL;
1869         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1870                 goto out;
1871
1872         err = 0;
1873         prev = 0;
1874         do {
1875                 for (i = prev; i < next; i++) {
1876                         unsigned long start = jiffies;
1877                         struct mce *m = &mcelog.entry[i];
1878
1879                         while (!m->finished) {
1880                                 if (time_after_eq(jiffies, start + 2)) {
1881                                         memset(m, 0, sizeof(*m));
1882                                         goto timeout;
1883                                 }
1884                                 cpu_relax();
1885                         }
1886                         smp_rmb();
1887                         err |= copy_to_user(buf, m, sizeof(*m));
1888                         buf += sizeof(*m);
1889 timeout:
1890                         ;
1891                 }
1892
1893                 memset(mcelog.entry + prev, 0,
1894                        (next - prev) * sizeof(struct mce));
1895                 prev = next;
1896                 next = cmpxchg(&mcelog.next, prev, 0);
1897         } while (next != prev);
1898
1899         synchronize_sched();
1900
1901         /*
1902          * Collect entries that were still getting written before the
1903          * synchronize.
1904          */
1905         on_each_cpu(collect_tscs, cpu_tsc, 1);
1906
1907         for (i = next; i < MCE_LOG_LEN; i++) {
1908                 struct mce *m = &mcelog.entry[i];
1909
1910                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1911                         err |= copy_to_user(buf, m, sizeof(*m));
1912                         smp_rmb();
1913                         buf += sizeof(*m);
1914                         memset(m, 0, sizeof(*m));
1915                 }
1916         }
1917
1918         if (err)
1919                 err = -EFAULT;
1920
1921 out:
1922         mutex_unlock(&mce_chrdev_read_mutex);
1923         kfree(cpu_tsc);
1924
1925         return err ? err : buf - ubuf;
1926 }
1927
1928 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1929 {
1930         poll_wait(file, &mce_chrdev_wait, wait);
1931         if (READ_ONCE(mcelog.next))
1932                 return POLLIN | POLLRDNORM;
1933         if (!mce_apei_read_done && apei_check_mce())
1934                 return POLLIN | POLLRDNORM;
1935         return 0;
1936 }
1937
1938 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1939                                 unsigned long arg)
1940 {
1941         int __user *p = (int __user *)arg;
1942
1943         if (!capable(CAP_SYS_ADMIN))
1944                 return -EPERM;
1945
1946         switch (cmd) {
1947         case MCE_GET_RECORD_LEN:
1948                 return put_user(sizeof(struct mce), p);
1949         case MCE_GET_LOG_LEN:
1950                 return put_user(MCE_LOG_LEN, p);
1951         case MCE_GETCLEAR_FLAGS: {
1952                 unsigned flags;
1953
1954                 do {
1955                         flags = mcelog.flags;
1956                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1957
1958                 return put_user(flags, p);
1959         }
1960         default:
1961                 return -ENOTTY;
1962         }
1963 }
1964
1965 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1966                             size_t usize, loff_t *off);
1967
1968 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1969                              const char __user *ubuf,
1970                              size_t usize, loff_t *off))
1971 {
1972         mce_write = fn;
1973 }
1974 EXPORT_SYMBOL_GPL(register_mce_write_callback);
1975
1976 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1977                                 size_t usize, loff_t *off)
1978 {
1979         if (mce_write)
1980                 return mce_write(filp, ubuf, usize, off);
1981         else
1982                 return -EINVAL;
1983 }
1984
1985 static const struct file_operations mce_chrdev_ops = {
1986         .open                   = mce_chrdev_open,
1987         .release                = mce_chrdev_release,
1988         .read                   = mce_chrdev_read,
1989         .write                  = mce_chrdev_write,
1990         .poll                   = mce_chrdev_poll,
1991         .unlocked_ioctl         = mce_chrdev_ioctl,
1992         .llseek                 = no_llseek,
1993 };
1994
1995 static struct miscdevice mce_chrdev_device = {
1996         MISC_MCELOG_MINOR,
1997         "mcelog",
1998         &mce_chrdev_ops,
1999 };
2000
2001 static void __mce_disable_bank(void *arg)
2002 {
2003         int bank = *((int *)arg);
2004         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2005         cmci_disable_bank(bank);
2006 }
2007
2008 void mce_disable_bank(int bank)
2009 {
2010         if (bank >= mca_cfg.banks) {
2011                 pr_warn(FW_BUG
2012                         "Ignoring request to disable invalid MCA bank %d.\n",
2013                         bank);
2014                 return;
2015         }
2016         set_bit(bank, mce_banks_ce_disabled);
2017         on_each_cpu(__mce_disable_bank, &bank, 1);
2018 }
2019
2020 /*
2021  * mce=off Disables machine check
2022  * mce=no_cmci Disables CMCI
2023  * mce=no_lmce Disables LMCE
2024  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2025  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2026  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2027  *      monarchtimeout is how long to wait for other CPUs on machine
2028  *      check, or 0 to not wait
2029  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2030  * mce=nobootlog Don't log MCEs from before booting.
2031  * mce=bios_cmci_threshold Don't program the CMCI threshold
2032  */
2033 static int __init mcheck_enable(char *str)
2034 {
2035         struct mca_config *cfg = &mca_cfg;
2036
2037         if (*str == 0) {
2038                 enable_p5_mce();
2039                 return 1;
2040         }
2041         if (*str == '=')
2042                 str++;
2043         if (!strcmp(str, "off"))
2044                 cfg->disabled = true;
2045         else if (!strcmp(str, "no_cmci"))
2046                 cfg->cmci_disabled = true;
2047         else if (!strcmp(str, "no_lmce"))
2048                 cfg->lmce_disabled = true;
2049         else if (!strcmp(str, "dont_log_ce"))
2050                 cfg->dont_log_ce = true;
2051         else if (!strcmp(str, "ignore_ce"))
2052                 cfg->ignore_ce = true;
2053         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2054                 cfg->bootlog = (str[0] == 'b');
2055         else if (!strcmp(str, "bios_cmci_threshold"))
2056                 cfg->bios_cmci_threshold = true;
2057         else if (isdigit(str[0])) {
2058                 if (get_option(&str, &cfg->tolerant) == 2)
2059                         get_option(&str, &(cfg->monarch_timeout));
2060         } else {
2061                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2062                 return 0;
2063         }
2064         return 1;
2065 }
2066 __setup("mce", mcheck_enable);
2067
2068 int __init mcheck_init(void)
2069 {
2070         mcheck_intel_therm_init();
2071         mce_register_decode_chain(&mce_srao_nb);
2072         mcheck_vendor_init_severity();
2073
2074         INIT_WORK(&mce_work, mce_process_work);
2075         init_irq_work(&mce_irq_work, mce_irq_work_cb);
2076
2077         return 0;
2078 }
2079
2080 /*
2081  * mce_syscore: PM support
2082  */
2083
2084 /*
2085  * Disable machine checks on suspend and shutdown. We can't really handle
2086  * them later.
2087  */
2088 static void mce_disable_error_reporting(void)
2089 {
2090         int i;
2091
2092         for (i = 0; i < mca_cfg.banks; i++) {
2093                 struct mce_bank *b = &mce_banks[i];
2094
2095                 if (b->init)
2096                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2097         }
2098         return;
2099 }
2100
2101 static void vendor_disable_error_reporting(void)
2102 {
2103         /*
2104          * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2105          * Disabling them for just a single offlined CPU is bad, since it will
2106          * inhibit reporting for all shared resources on the socket like the
2107          * last level cache (LLC), the integrated memory controller (iMC), etc.
2108          */
2109         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2110                 return;
2111
2112         mce_disable_error_reporting();
2113 }
2114
2115 static int mce_syscore_suspend(void)
2116 {
2117         vendor_disable_error_reporting();
2118         return 0;
2119 }
2120
2121 static void mce_syscore_shutdown(void)
2122 {
2123         vendor_disable_error_reporting();
2124 }
2125
2126 /*
2127  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2128  * Only one CPU is active at this time, the others get re-added later using
2129  * CPU hotplug:
2130  */
2131 static void mce_syscore_resume(void)
2132 {
2133         __mcheck_cpu_init_generic();
2134         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2135 }
2136
2137 static struct syscore_ops mce_syscore_ops = {
2138         .suspend        = mce_syscore_suspend,
2139         .shutdown       = mce_syscore_shutdown,
2140         .resume         = mce_syscore_resume,
2141 };
2142
2143 /*
2144  * mce_device: Sysfs support
2145  */
2146
2147 static void mce_cpu_restart(void *data)
2148 {
2149         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2150                 return;
2151         __mcheck_cpu_init_generic();
2152         __mcheck_cpu_init_timer();
2153 }
2154
2155 /* Reinit MCEs after user configuration changes */
2156 static void mce_restart(void)
2157 {
2158         mce_timer_delete_all();
2159         on_each_cpu(mce_cpu_restart, NULL, 1);
2160 }
2161
2162 /* Toggle features for corrected errors */
2163 static void mce_disable_cmci(void *data)
2164 {
2165         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2166                 return;
2167         cmci_clear();
2168 }
2169
2170 static void mce_enable_ce(void *all)
2171 {
2172         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2173                 return;
2174         cmci_reenable();
2175         cmci_recheck();
2176         if (all)
2177                 __mcheck_cpu_init_timer();
2178 }
2179
2180 static struct bus_type mce_subsys = {
2181         .name           = "machinecheck",
2182         .dev_name       = "machinecheck",
2183 };
2184
2185 DEFINE_PER_CPU(struct device *, mce_device);
2186
2187 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2188
2189 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2190 {
2191         return container_of(attr, struct mce_bank, attr);
2192 }
2193
2194 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2195                          char *buf)
2196 {
2197         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2198 }
2199
2200 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2201                         const char *buf, size_t size)
2202 {
2203         u64 new;
2204
2205         if (kstrtou64(buf, 0, &new) < 0)
2206                 return -EINVAL;
2207
2208         attr_to_bank(attr)->ctl = new;
2209         mce_restart();
2210
2211         return size;
2212 }
2213
2214 static ssize_t
2215 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2216 {
2217         strcpy(buf, mce_helper);
2218         strcat(buf, "\n");
2219         return strlen(mce_helper) + 1;
2220 }
2221
2222 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2223                                 const char *buf, size_t siz)
2224 {
2225         char *p;
2226
2227         strncpy(mce_helper, buf, sizeof(mce_helper));
2228         mce_helper[sizeof(mce_helper)-1] = 0;
2229         p = strchr(mce_helper, '\n');
2230
2231         if (p)
2232                 *p = 0;
2233
2234         return strlen(mce_helper) + !!p;
2235 }
2236
2237 static ssize_t set_ignore_ce(struct device *s,
2238                              struct device_attribute *attr,
2239                              const char *buf, size_t size)
2240 {
2241         u64 new;
2242
2243         if (kstrtou64(buf, 0, &new) < 0)
2244                 return -EINVAL;
2245
2246         if (mca_cfg.ignore_ce ^ !!new) {
2247                 if (new) {
2248                         /* disable ce features */
2249                         mce_timer_delete_all();
2250                         on_each_cpu(mce_disable_cmci, NULL, 1);
2251                         mca_cfg.ignore_ce = true;
2252                 } else {
2253                         /* enable ce features */
2254                         mca_cfg.ignore_ce = false;
2255                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2256                 }
2257         }
2258         return size;
2259 }
2260
2261 static ssize_t set_cmci_disabled(struct device *s,
2262                                  struct device_attribute *attr,
2263                                  const char *buf, size_t size)
2264 {
2265         u64 new;
2266
2267         if (kstrtou64(buf, 0, &new) < 0)
2268                 return -EINVAL;
2269
2270         if (mca_cfg.cmci_disabled ^ !!new) {
2271                 if (new) {
2272                         /* disable cmci */
2273                         on_each_cpu(mce_disable_cmci, NULL, 1);
2274                         mca_cfg.cmci_disabled = true;
2275                 } else {
2276                         /* enable cmci */
2277                         mca_cfg.cmci_disabled = false;
2278                         on_each_cpu(mce_enable_ce, NULL, 1);
2279                 }
2280         }
2281         return size;
2282 }
2283
2284 static ssize_t store_int_with_restart(struct device *s,
2285                                       struct device_attribute *attr,
2286                                       const char *buf, size_t size)
2287 {
2288         ssize_t ret = device_store_int(s, attr, buf, size);
2289         mce_restart();
2290         return ret;
2291 }
2292
2293 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2294 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2295 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2296 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2297
2298 static struct dev_ext_attribute dev_attr_check_interval = {
2299         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2300         &check_interval
2301 };
2302
2303 static struct dev_ext_attribute dev_attr_ignore_ce = {
2304         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2305         &mca_cfg.ignore_ce
2306 };
2307
2308 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2309         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2310         &mca_cfg.cmci_disabled
2311 };
2312
2313 static struct device_attribute *mce_device_attrs[] = {
2314         &dev_attr_tolerant.attr,
2315         &dev_attr_check_interval.attr,
2316         &dev_attr_trigger,
2317         &dev_attr_monarch_timeout.attr,
2318         &dev_attr_dont_log_ce.attr,
2319         &dev_attr_ignore_ce.attr,
2320         &dev_attr_cmci_disabled.attr,
2321         NULL
2322 };
2323
2324 static cpumask_var_t mce_device_initialized;
2325
2326 static void mce_device_release(struct device *dev)
2327 {
2328         kfree(dev);
2329 }
2330
2331 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2332 static int mce_device_create(unsigned int cpu)
2333 {
2334         struct device *dev;
2335         int err;
2336         int i, j;
2337
2338         if (!mce_available(&boot_cpu_data))
2339                 return -EIO;
2340
2341         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2342         if (!dev)
2343                 return -ENOMEM;
2344         dev->id  = cpu;
2345         dev->bus = &mce_subsys;
2346         dev->release = &mce_device_release;
2347
2348         err = device_register(dev);
2349         if (err) {
2350                 put_device(dev);
2351                 return err;
2352         }
2353
2354         for (i = 0; mce_device_attrs[i]; i++) {
2355                 err = device_create_file(dev, mce_device_attrs[i]);
2356                 if (err)
2357                         goto error;
2358         }
2359         for (j = 0; j < mca_cfg.banks; j++) {
2360                 err = device_create_file(dev, &mce_banks[j].attr);
2361                 if (err)
2362                         goto error2;
2363         }
2364         cpumask_set_cpu(cpu, mce_device_initialized);
2365         per_cpu(mce_device, cpu) = dev;
2366
2367         return 0;
2368 error2:
2369         while (--j >= 0)
2370                 device_remove_file(dev, &mce_banks[j].attr);
2371 error:
2372         while (--i >= 0)
2373                 device_remove_file(dev, mce_device_attrs[i]);
2374
2375         device_unregister(dev);
2376
2377         return err;
2378 }
2379
2380 static void mce_device_remove(unsigned int cpu)
2381 {
2382         struct device *dev = per_cpu(mce_device, cpu);
2383         int i;
2384
2385         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2386                 return;
2387
2388         for (i = 0; mce_device_attrs[i]; i++)
2389                 device_remove_file(dev, mce_device_attrs[i]);
2390
2391         for (i = 0; i < mca_cfg.banks; i++)
2392                 device_remove_file(dev, &mce_banks[i].attr);
2393
2394         device_unregister(dev);
2395         cpumask_clear_cpu(cpu, mce_device_initialized);
2396         per_cpu(mce_device, cpu) = NULL;
2397 }
2398
2399 /* Make sure there are no machine checks on offlined CPUs. */
2400 static void mce_disable_cpu(void *h)
2401 {
2402         unsigned long action = *(unsigned long *)h;
2403
2404         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2405                 return;
2406
2407         hrtimer_cancel(this_cpu_ptr(&mce_timer));
2408
2409         if (!(action & CPU_TASKS_FROZEN))
2410                 cmci_clear();
2411
2412         vendor_disable_error_reporting();
2413 }
2414
2415 static void mce_reenable_cpu(void *h)
2416 {
2417         unsigned long action = *(unsigned long *)h;
2418         int i;
2419
2420         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2421                 return;
2422
2423         if (!(action & CPU_TASKS_FROZEN))
2424                 cmci_reenable();
2425         for (i = 0; i < mca_cfg.banks; i++) {
2426                 struct mce_bank *b = &mce_banks[i];
2427
2428                 if (b->init)
2429                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2430         }
2431         __mcheck_cpu_init_timer();
2432 }
2433
2434 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2435 static int
2436 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2437 {
2438         unsigned int cpu = (unsigned long)hcpu;
2439
2440         switch (action & ~CPU_TASKS_FROZEN) {
2441         case CPU_ONLINE:
2442                 mce_device_create(cpu);
2443                 if (threshold_cpu_callback)
2444                         threshold_cpu_callback(action, cpu);
2445                 break;
2446         case CPU_DEAD:
2447                 if (threshold_cpu_callback)
2448                         threshold_cpu_callback(action, cpu);
2449                 mce_device_remove(cpu);
2450                 mce_intel_hcpu_update(cpu);
2451
2452                 /* intentionally ignoring frozen here */
2453                 if (!(action & CPU_TASKS_FROZEN))
2454                         cmci_rediscover();
2455                 break;
2456         case CPU_DOWN_PREPARE:
2457                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2458                 break;
2459         case CPU_DOWN_FAILED:
2460                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2461                 break;
2462         }
2463
2464         return NOTIFY_OK;
2465 }
2466
2467 static struct notifier_block mce_cpu_notifier = {
2468         .notifier_call = mce_cpu_callback,
2469 };
2470
2471 static __init void mce_init_banks(void)
2472 {
2473         int i;
2474
2475         for (i = 0; i < mca_cfg.banks; i++) {
2476                 struct mce_bank *b = &mce_banks[i];
2477                 struct device_attribute *a = &b->attr;
2478
2479                 sysfs_attr_init(&a->attr);
2480                 a->attr.name    = b->attrname;
2481                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2482
2483                 a->attr.mode    = 0644;
2484                 a->show         = show_bank;
2485                 a->store        = set_bank;
2486         }
2487 }
2488
2489 static __init int mcheck_init_device(void)
2490 {
2491         int err;
2492         int i = 0;
2493
2494         if (!mce_available(&boot_cpu_data)) {
2495                 err = -EIO;
2496                 goto err_out;
2497         }
2498
2499         err = mce_notify_work_init();
2500         if (err)
2501                 goto err_out;
2502
2503         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2504                 err = -ENOMEM;
2505                 goto err_out;
2506         }
2507
2508         mce_init_banks();
2509
2510         err = subsys_system_register(&mce_subsys, NULL);
2511         if (err)
2512                 goto err_out_mem;
2513
2514         cpu_notifier_register_begin();
2515         for_each_online_cpu(i) {
2516                 err = mce_device_create(i);
2517                 if (err) {
2518                         /*
2519                          * Register notifier anyway (and do not unreg it) so
2520                          * that we don't leave undeleted timers, see notifier
2521                          * callback above.
2522                          */
2523                         __register_hotcpu_notifier(&mce_cpu_notifier);
2524                         cpu_notifier_register_done();
2525                         goto err_device_create;
2526                 }
2527         }
2528
2529         __register_hotcpu_notifier(&mce_cpu_notifier);
2530         cpu_notifier_register_done();
2531
2532         register_syscore_ops(&mce_syscore_ops);
2533
2534         /* register character device /dev/mcelog */
2535         err = misc_register(&mce_chrdev_device);
2536         if (err)
2537                 goto err_register;
2538
2539         return 0;
2540
2541 err_register:
2542         unregister_syscore_ops(&mce_syscore_ops);
2543
2544 err_device_create:
2545         /*
2546          * We didn't keep track of which devices were created above, but
2547          * even if we had, the set of online cpus might have changed.
2548          * Play safe and remove for every possible cpu, since
2549          * mce_device_remove() will do the right thing.
2550          */
2551         for_each_possible_cpu(i)
2552                 mce_device_remove(i);
2553
2554 err_out_mem:
2555         free_cpumask_var(mce_device_initialized);
2556
2557 err_out:
2558         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2559
2560         return err;
2561 }
2562 device_initcall_sync(mcheck_init_device);
2563
2564 /*
2565  * Old style boot options parsing. Only for compatibility.
2566  */
2567 static int __init mcheck_disable(char *str)
2568 {
2569         mca_cfg.disabled = true;
2570         return 1;
2571 }
2572 __setup("nomce", mcheck_disable);
2573
2574 #ifdef CONFIG_DEBUG_FS
2575 struct dentry *mce_get_debugfs_dir(void)
2576 {
2577         static struct dentry *dmce;
2578
2579         if (!dmce)
2580                 dmce = debugfs_create_dir("mce", NULL);
2581
2582         return dmce;
2583 }
2584
2585 static void mce_reset(void)
2586 {
2587         cpu_missing = 0;
2588         atomic_set(&mce_fake_panicked, 0);
2589         atomic_set(&mce_executing, 0);
2590         atomic_set(&mce_callin, 0);
2591         atomic_set(&global_nwo, 0);
2592 }
2593
2594 static int fake_panic_get(void *data, u64 *val)
2595 {
2596         *val = fake_panic;
2597         return 0;
2598 }
2599
2600 static int fake_panic_set(void *data, u64 val)
2601 {
2602         mce_reset();
2603         fake_panic = val;
2604         return 0;
2605 }
2606
2607 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2608                         fake_panic_set, "%llu\n");
2609
2610 static int __init mcheck_debugfs_init(void)
2611 {
2612         struct dentry *dmce, *ffake_panic;
2613
2614         dmce = mce_get_debugfs_dir();
2615         if (!dmce)
2616                 return -ENOMEM;
2617         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2618                                           &fake_panic_fops);
2619         if (!ffake_panic)
2620                 return -ENOMEM;
2621
2622         return 0;
2623 }
2624 #else
2625 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2626 #endif
2627
2628 static int __init mcheck_late_init(void)
2629 {
2630         mcheck_debugfs_init();
2631
2632         /*
2633          * Flush out everything that has been logged during early boot, now that
2634          * everything has been initialized (workqueues, decoders, ...).
2635          */
2636         mce_schedule_work();
2637
2638         return 0;
2639 }
2640 late_initcall(mcheck_late_init);