These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/arch/x86/xen/setup.c
index 55f388e..7ab2951 100644
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
+#include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "vdso.h"
-#include "p2m.h"
 #include "mmu.h"
 
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
 /*
  * Buffer used to remap identity mapped pages. We only need the virtual space.
  * The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
  */
 #define EXTRA_MEM_RATIO                (10)
 
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+static void __init xen_parse_512gb(void)
+{
+       bool val = false;
+       char *arg;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
+       if (!arg)
+               return;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+       if (!arg)
+               val = true;
+       else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+               return;
+
+       xen_512gb_limit = val;
+}
+
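The option parsing above accepts both a bare "xen_512gb_limit" token and an explicit "xen_512gb_limit=<bool>" value: a bare token enables the limit, an unparsable value leaves the compiled-in default untouched. A minimal userspace sketch of the same two-step strstr() parse (hypothetical harness; strtobool_sketch() stands in for the kernel's strtobool()):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the kernel's strtobool(): 0 on success. */
static int strtobool_sketch(const char *s, bool *res)
{
	if (s[0] == '1' || s[0] == 'y' || s[0] == 'Y') { *res = true;  return 0; }
	if (s[0] == '0' || s[0] == 'n' || s[0] == 'N') { *res = false; return 0; }
	return -1;
}

/* Mirrors xen_parse_512gb(): bare option means "true", bad value is ignored. */
static bool parse_512gb(const char *cmdline, bool current)
{
	bool val = false;
	const char *arg = strstr(cmdline, "xen_512gb_limit");

	if (!arg)
		return current;			/* option absent: keep default  */
	arg = strstr(cmdline, "xen_512gb_limit=");
	if (!arg)
		val = true;			/* bare option: enable limit    */
	else if (strtobool_sketch(arg + strlen("xen_512gb_limit="), &val))
		return current;			/* unparsable value: no change  */
	return val;
}

int main(void)
{
	printf("%d %d %d\n",
	       parse_512gb("root=/dev/xvda1 xen_512gb_limit", false),
	       parse_512gb("xen_512gb_limit=0", true),
	       parse_512gb("quiet", true));	/* prints: 1 0 1 */
	return 0;
}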
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
 
+       /*
+        * No need to check for zero size, should happen rarely and will only
+        * write a new entry regarded to be unused due to zero size.
+        */
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
                /* Add new region. */
-               if (xen_extra_mem[i].size == 0) {
-                       xen_extra_mem[i].start = start;
-                       xen_extra_mem[i].size  = size;
+               if (xen_extra_mem[i].n_pfns == 0) {
+                       xen_extra_mem[i].start_pfn = start_pfn;
+                       xen_extra_mem[i].n_pfns = n_pfns;
                        break;
                }
                /* Append to existing region. */
-               if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
-                       xen_extra_mem[i].size += size;
+               if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+                   start_pfn) {
+                       xen_extra_mem[i].n_pfns += n_pfns;
                        break;
                }
        }
        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
                printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-       memblock_reserve(start, size);
+       memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
-       phys_addr_t start_r, size_r;
+       unsigned long start_r, size_r;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               start_r = xen_extra_mem[i].start;
-               size_r = xen_extra_mem[i].size;
+               start_r = xen_extra_mem[i].start_pfn;
+               size_r = xen_extra_mem[i].n_pfns;
 
                /* Start of region. */
-               if (start_r == start) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].start += size;
-                       xen_extra_mem[i].size -= size;
+               if (start_r == start_pfn) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].start_pfn += n_pfns;
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* End of region. */
-               if (start_r + size_r == start + size) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].size -= size;
+               if (start_r + size_r == start_pfn + n_pfns) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* Mid of region. */
-               if (start > start_r && start < start_r + size_r) {
-                       BUG_ON(start + size > start_r + size_r);
-                       xen_extra_mem[i].size = start - start_r;
+               if (start_pfn > start_r && start_pfn < start_r + size_r) {
+                       BUG_ON(start_pfn + n_pfns > start_r + size_r);
+                       xen_extra_mem[i].n_pfns = start_pfn - start_r;
                        /* Calling memblock_reserve() again is okay. */
-                       xen_add_extra_mem(start + size, start_r + size_r -
-                                         (start + size));
+                       xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+                                         (start_pfn + n_pfns));
                        break;
                }
        }
-       memblock_free(start, size);
+       memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
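With this change xen_extra_mem[] tracks page frame numbers (start_pfn/n_pfns) instead of byte ranges; only the memblock_reserve()/memblock_free() calls convert back to physical addresses via PFN_PHYS(). A small sketch of the PFN helpers involved, assuming 4 KiB pages (the macros below restate the kernel definitions purely for illustration):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_PHYS(x)	((uint64_t)(x) << PAGE_SHIFT)

int main(void)
{
	uint64_t start = 0x100000, size = 0x80000;	/* 512 KiB at 1 MiB */
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long n_pfns = PFN_UP(start + size) - start_pfn;

	/* prints: start_pfn=0x100 n_pfns=0x80 reserve=[0x100000,0x180000) */
	printf("start_pfn=%#lx n_pfns=%#lx reserve=[%#llx,%#llx)\n",
	       start_pfn, n_pfns,
	       (unsigned long long)PFN_PHYS(start_pfn),
	       (unsigned long long)PFN_PHYS(start_pfn + n_pfns));
	return 0;
}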
 
 /*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
 {
        int i;
-       phys_addr_t addr = PFN_PHYS(pfn);
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (addr >= xen_extra_mem[i].start &&
-                   addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+               if (pfn >= xen_extra_mem[i].start_pfn &&
+                   pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
                        return INVALID_P2M_ENTRY;
        }
 
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
        int i;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (!xen_extra_mem[i].size)
+               if (!xen_extra_mem[i].n_pfns)
                        continue;
-               pfn_s = PFN_DOWN(xen_extra_mem[i].start);
-               pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+               pfn_s = xen_extra_mem[i].start_pfn;
+               pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
                for (pfn = pfn_s; pfn < pfn_e; pfn++)
                        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        }
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
  * This function updates min_pfn with the pfn found and returns
  * the size of that range or zero if not found.
  */
-static unsigned long __init xen_find_pfn_range(
-       const struct e820entry *list, size_t map_size,
-       unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 {
-       const struct e820entry *entry;
+       const struct e820entry *entry = xen_e820_map;
        unsigned int i;
        unsigned long done = 0;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
 
@@ -182,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(
                e_pfn = PFN_DOWN(entry->addr + entry->size);
 
                /* We only care about E820 after this */
-               if (e_pfn < *min_pfn)
+               if (e_pfn <= *min_pfn)
                        continue;
 
                s_pfn = PFN_UP(entry->addr);
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
  * as a fallback if the remapping fails.
  */
 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+                       unsigned long end_pfn, unsigned long nr_pages)
 {
        unsigned long pfn, end;
        int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
                WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 
                if (ret == 1) {
-                       (*released)++;
+                       xen_released_pages++;
                        if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
                                break;
                } else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
  * to Xen and not remapped.
  */
 static unsigned long __init xen_set_identity_and_remap_chunk(
-        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
-       unsigned long *released, unsigned long *remapped)
+       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+       unsigned long remap_pfn)
 {
        unsigned long pfn;
        unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                if (cur_pfn + size > nr_pages)
                        size = nr_pages - cur_pfn;
 
-               remap_range_size = xen_find_pfn_range(list, map_size,
-                                                     &remap_pfn);
+               remap_range_size = xen_find_pfn_range(&remap_pfn);
                if (!remap_range_size) {
                        pr_warning("Unable to find available pfn range, not remapping identity pages\n");
                        xen_set_identity_and_release_chunk(cur_pfn,
-                               cur_pfn + left, nr_pages, released);
+                                               cur_pfn + left, nr_pages);
                        break;
                }
                /* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                /* Update variables to reflect new mappings. */
                i += size;
                remap_pfn += size;
-               *remapped += size;
        }
 
        /*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(
-       const struct e820entry *list, size_t map_size, unsigned long nr_pages,
-       unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
 {
        phys_addr_t start = 0;
        unsigned long last_pfn = nr_pages;
-       const struct e820entry *entry;
-       unsigned long num_released = 0;
-       unsigned long num_remapped = 0;
+       const struct e820entry *entry = xen_e820_map;
        int i;
 
        /*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
         * example) the DMI tables in a reserved region that begins on
         * a non-page boundary.
         */
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                phys_addr_t end = entry->addr + entry->size;
-               if (entry->type == E820_RAM || i == map_size - 1) {
+               if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
                        unsigned long start_pfn = PFN_DOWN(start);
                        unsigned long end_pfn = PFN_UP(end);
 
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
 
                        if (start_pfn < end_pfn)
                                last_pfn = xen_set_identity_and_remap_chunk(
-                                               list, map_size, start_pfn,
-                                               end_pfn, nr_pages, last_pfn,
-                                               &num_released, &num_remapped);
+                                               start_pfn, end_pfn, nr_pages,
+                                               last_pfn);
                        start = end;
                }
        }
 
-       *released = num_released;
-       *remapped = num_remapped;
-
-       pr_info("Released %ld page(s)\n", num_released);
+       pr_info("Released %ld page(s)\n", xen_released_pages);
 }
 
 /*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
                } else if (pfn_s + len == xen_remap_buf.target_pfn) {
                        len += xen_remap_buf.size;
                } else {
-                       xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+                       xen_del_extra_mem(pfn_s, len);
                        pfn_s = xen_remap_buf.target_pfn;
                        len = xen_remap_buf.size;
                }
@@ -504,18 +523,35 @@ void __init xen_remap_memory(void)
        }
 
        if (pfn_s != ~0UL && len)
-               xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+               xen_del_extra_mem(pfn_s, len);
 
        set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
 
        pr_info("Remapped %ld page(s)\n", remapped);
 }
 
+static unsigned long __init xen_get_pages_limit(void)
+{
+       unsigned long limit;
+
+#ifdef CONFIG_X86_32
+       limit = GB(64) / PAGE_SIZE;
+#else
+       limit = MAXMEM / PAGE_SIZE;
+       if (!xen_initial_domain() && xen_512gb_limit)
+               limit = GB(512) / PAGE_SIZE;
+#endif
+       return limit;
+}
+
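xen_get_pages_limit() caps the domain size at GB(64) worth of pages on 32 bit, at MAXMEM worth of pages on 64 bit, tightened to GB(512) for an unprivileged domain while the 512 GiB limit is in effect. A quick arithmetic check of the page counts produced by the new GB() macro (assuming 4 KiB pages):

#include <stdint.h>
#include <stdio.h>

#define GB(x)		((uint64_t)(x) * 1024 * 1024 * 1024)
#define PAGE_SIZE	4096ULL		/* assumed 4 KiB pages */

int main(void)
{
	/* prints: 64 GiB = 16777216 pages, 512 GiB = 134217728 pages */
	printf("64 GiB = %llu pages, 512 GiB = %llu pages\n",
	       (unsigned long long)(GB(64) / PAGE_SIZE),
	       (unsigned long long)(GB(512) / PAGE_SIZE));
	return 0;
}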
 static unsigned long __init xen_get_max_pages(void)
 {
-       unsigned long max_pages = MAX_DOMAIN_PAGES;
+       unsigned long max_pages, limit;
        domid_t domid = DOMID_SELF;
-       int ret;
+       long ret;
+
+       limit = xen_get_pages_limit();
+       max_pages = limit;
 
        /*
         * For the initial domain we use the maximum reservation as
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
                        max_pages = ret;
        }
 
-       return min(max_pages, MAX_DOMAIN_PAGES);
+       return min(max_pages, limit);
 }
 
 static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
        e820_add_region(start, end - start, type);
 }
 
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
 {
-       struct e820entry *entry;
+       struct e820entry *entry = xen_e820_map;
        unsigned int i;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                if (entry->type == E820_UNUSABLE)
                        entry->type = E820_RAM;
        }
 }
 
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+       unsigned long extra = 0;
+       unsigned long start_pfn, end_pfn;
+       const struct e820entry *entry = xen_e820_map;
+       int i;
+
+       end_pfn = 0;
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+               start_pfn = PFN_DOWN(entry->addr);
+               /* Adjacent regions on non-page boundaries handling! */
+               end_pfn = min(end_pfn, start_pfn);
+
+               if (start_pfn >= max_pfn)
+                       return extra + max_pfn - end_pfn;
+
+               /* Add any holes in map to result. */
+               extra += start_pfn - end_pfn;
+
+               end_pfn = PFN_UP(entry->addr + entry->size);
+               end_pfn = min(end_pfn, max_pfn);
+
+               if (entry->type != E820_RAM)
+                       extra += end_pfn - start_pfn;
+       }
+
+       return extra;
+}
+
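xen_count_remap_pages() makes one pass over the E820 map and, below max_pfn, adds up both the holes between entries and the non-RAM entries: exactly the pages that will be released or remapped later and therefore need room in the p2m. A simplified sketch of that accounting over a made-up three-entry map (it omits the adjacent-entry rounding fix-up of the real code):

#include <stdio.h>

struct toy_entry { unsigned long start_pfn, end_pfn; int ram; };

int main(void)
{
	/* RAM 0x000-0x0a0, reserved 0x0a0-0x100, hole to 0x110, RAM 0x110-0x200 */
	struct toy_entry map[] = {
		{ 0x000, 0x0a0, 1 },
		{ 0x0a0, 0x100, 0 },
		{ 0x110, 0x200, 1 },
	};
	unsigned long max_pfn = 0x180, extra = 0, end_pfn = 0;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		unsigned long s = map[i].start_pfn, e = map[i].end_pfn;

		if (s >= max_pfn) {
			extra += max_pfn - end_pfn;
			break;
		}
		extra += s - end_pfn;			/* hole before the entry */
		end_pfn = e < max_pfn ? e : max_pfn;
		if (!map[i].ram)
			extra += end_pfn - s;		/* non-RAM range         */
	}

	/* reserved 0x0a0-0x100 (0x60 pages) + hole 0x100-0x110 (0x10) = 0x70 */
	printf("extra = %#lx pages\n", extra);
	return 0;
}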
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+       struct e820entry *entry;
+       unsigned mapcnt;
+       phys_addr_t end;
+
+       if (!size)
+               return false;
+
+       end = start + size;
+       entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+               if (entry->type == E820_RAM && entry->addr <= start &&
+                   (entry->addr + entry->size) >= end)
+                       return false;
+
+               entry++;
+       }
+
+       return true;
+}
+
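xen_is_e820_reserved() reports a range as conflicting unless it lies entirely inside a single E820_RAM entry; ranges straddling an entry boundary, sitting in a hole or in a non-RAM entry all count as reserved, and a zero-sized range never does. A compact restatement of that rule with a few checked examples (hypothetical map and values, not the patch code):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_e820 { uint64_t addr, size; bool ram; };

/* Reserved unless the whole range is covered by one RAM entry. */
static bool toy_is_reserved(const struct toy_e820 *map, unsigned int n,
			    uint64_t start, uint64_t size)
{
	unsigned int i;

	if (!size)
		return false;
	for (i = 0; i < n; i++)
		if (map[i].ram && map[i].addr <= start &&
		    map[i].addr + map[i].size >= start + size)
			return false;
	return true;
}

int main(void)
{
	struct toy_e820 map[] = { { 0x100000, 0x700000, true } };	/* RAM up to 8 MiB */

	assert(!toy_is_reserved(map, 1, 0x200000, 0x1000));	/* inside RAM   */
	assert( toy_is_reserved(map, 1, 0x7ff000, 0x2000));	/* crosses end  */
	assert( toy_is_reserved(map, 1, 0x900000, 0x1000));	/* outside map  */
	return 0;
}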
+/*
+ * Find a free area in physical memory not yet reserved and compliant with
+ * E820 map.
+ * Used to relocate pre-allocated areas like initrd or p2m list which are in
+ * conflict with the to be used E820 map.
+ * In case no area is found, return 0. Otherwise return the physical address
+ * of the area which is already reserved for convenience.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+       unsigned mapcnt;
+       phys_addr_t addr, start;
+       struct e820entry *entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+               if (entry->type != E820_RAM || entry->size < size)
+                       continue;
+               start = entry->addr;
+               for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+                       if (!memblock_is_reserved(addr))
+                               continue;
+                       start = addr + PAGE_SIZE;
+                       if (start + size > entry->addr + entry->size)
+                               break;
+               }
+               if (addr >= start + size) {
+                       memblock_reserve(start, size);
+                       return start;
+               }
+       }
+
+       return 0;
+}
+
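The scan in xen_find_free_area() walks each E820_RAM entry page by page and, whenever it meets a page memblock has already reserved, restarts the candidate window right behind that page; the entry is abandoned once the window would run past its end. A simplified sketch of the same window-restart scan over a plain "reserved" array (hypothetical; find_window() returns the first suitable pfn or -1):

#include <stdbool.h>
#include <stdio.h>

/* Return the first pfn of 'want' consecutive unreserved pages, or -1. */
static long find_window(const bool *reserved, unsigned long n_pages,
			unsigned long want)
{
	unsigned long start = 0, pfn;

	for (pfn = start; pfn < start + want; pfn++) {
		if (pfn >= n_pages)
			return -1;		/* window ran past the region  */
		if (reserved[pfn])
			start = pfn + 1;	/* restart just behind the hit */
	}
	return (long)start;
}

int main(void)
{
	/*      pfn:         0      1     2      3      4      5      6    */
	bool reserved[7] = { false, true, false, false, false, false, true };

	/* Three consecutive free pages: pfn 1 is taken, so the answer is 2. */
	printf("window starts at pfn %ld\n", find_window(reserved, 7, 3));
	return 0;
}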
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+                                  phys_addr_t n)
+{
+       phys_addr_t dest_off, src_off, dest_len, src_len, len;
+       void *from, *to;
+
+       while (n) {
+               dest_off = dest & ~PAGE_MASK;
+               src_off = src & ~PAGE_MASK;
+               dest_len = n;
+               if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+                       dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+               src_len = n;
+               if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+                       src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+               len = min(dest_len, src_len);
+               to = early_memremap(dest - dest_off, dest_len + dest_off);
+               from = early_memremap(src - src_off, src_len + src_off);
+               memcpy(to, from, len);
+               early_memunmap(to, dest_len + dest_off);
+               early_memunmap(from, src_len + src_off);
+               n -= len;
+               dest += len;
+               src += len;
+       }
+}
+
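Because neither buffer has a permanent kernel mapping at this point, xen_phys_memcpy() maps source and destination through early_memremap() in pieces; each piece is bounded by the early fixmap window (NR_FIX_BTMAPS pages) minus the respective in-page offset, and the copied length is the minimum of the two bounds. A rough illustration of how the first chunk of a copy is sized (assuming 4 KiB pages and a 64-page window; the actual NR_FIX_BTMAPS value comes from the architecture headers):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define NR_FIX_BTMAPS	64		/* assumed window: 64 pages = 256 KiB */

int main(void)
{
	uint64_t dest = 0x2000800, src = 0x5000000, n = 0x100000;	/* 1 MiB copy */
	uint64_t dest_off = dest & ~PAGE_MASK;		/* 0x800 */
	uint64_t src_off  = src  & ~PAGE_MASK;		/* 0x000 */
	uint64_t window   = (uint64_t)NR_FIX_BTMAPS << PAGE_SHIFT;
	uint64_t dest_len = n < window - dest_off ? n : window - dest_off;
	uint64_t src_len  = n < window - src_off  ? n : window - src_off;
	uint64_t len      = dest_len < src_len ? dest_len : src_len;

	/* prints: first chunk = 0x3f800 bytes (window minus dest offset) */
	printf("first chunk = %#llx bytes\n", (unsigned long long)len);
	return 0;
}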
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+       phys_addr_t start, size;
+
+       if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+               start = __pa(xen_start_info->mfn_list);
+               size = PFN_ALIGN(xen_start_info->nr_pages *
+                                sizeof(unsigned long));
+       } else {
+               start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+               size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+       }
+
+       if (!xen_is_e820_reserved(start, size)) {
+               memblock_reserve(start, size);
+               return;
+       }
+
+#ifdef CONFIG_X86_32
+       /*
+        * Relocating the p2m on 32 bit system to an arbitrary virtual address
+        * is not supported, so just give up.
+        */
+       xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+       BUG();
+#else
+       xen_relocate_p2m();
+#endif
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
-       unsigned long max_pfn = xen_start_info->nr_pages;
-       phys_addr_t mem_end;
+       unsigned long max_pfn, pfn_s, n_pfns;
+       phys_addr_t mem_end, addr, size, chunk_size;
+       u32 type;
        int rc;
        struct xen_memory_map memmap;
        unsigned long max_pages;
        unsigned long extra_pages = 0;
-       unsigned long remapped_pages;
        int i;
        int op;
 
-       max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+       xen_parse_512gb();
+       max_pfn = xen_get_pages_limit();
+       max_pfn = min(max_pfn, xen_start_info->nr_pages);
        mem_end = PFN_PHYS(max_pfn);
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
        if (rc == -ENOSYS) {
                BUG_ON(xen_initial_domain());
                memmap.nr_entries = 1;
-               map[0].addr = 0ULL;
-               map[0].size = mem_end;
+               xen_e820_map[0].addr = 0ULL;
+               xen_e820_map[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
-               map[0].size += 8ULL << 20;
-               map[0].type = E820_RAM;
+               xen_e820_map[0].size += 8ULL << 20;
+               xen_e820_map[0].type = E820_RAM;
                rc = 0;
        }
        BUG_ON(rc);
        BUG_ON(memmap.nr_entries == 0);
+       xen_e820_map_entries = memmap.nr_entries;
 
        /*
         * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
         * a patch in the future.
         */
        if (xen_initial_domain())
-               xen_ignore_unusable(map, memmap.nr_entries);
+               xen_ignore_unusable();
 
        /* Make sure the Xen-supplied memory map is well-ordered. */
-       sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+       sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+                         &xen_e820_map_entries);
 
        max_pages = xen_get_max_pages();
-       if (max_pages > max_pfn)
-               extra_pages += max_pages - max_pfn;
 
-       /*
-        * Set identity map on non-RAM pages and prepare remapping the
-        * underlying RAM.
-        */
-       xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
-                                  &xen_released_pages, &remapped_pages);
+       /* How many extra pages do we need due to remapping? */
+       max_pages += xen_count_remap_pages(max_pfn);
 
-       extra_pages += xen_released_pages;
-       extra_pages += remapped_pages;
+       if (max_pages > max_pfn)
+               extra_pages += max_pages - max_pfn;
 
        /*
         * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,57 @@ char * __init xen_memory_setup(void)
         * is limited to the max size of lowmem, so that it doesn't
         * get completely filled.
         *
+        * Make sure we have no memory above max_pages, as this area
+        * isn't handled by the p2m management.
+        *
         * In principle there could be a problem in lowmem systems if
         * the initial memory is also very large with respect to
         * lowmem, but we won't try to deal with that here.
         */
-       extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-                         extra_pages);
+       extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+                          extra_pages, max_pages - max_pfn);
        i = 0;
-       while (i < memmap.nr_entries) {
-               phys_addr_t addr = map[i].addr;
-               phys_addr_t size = map[i].size;
-               u32 type = map[i].type;
+       addr = xen_e820_map[0].addr;
+       size = xen_e820_map[0].size;
+       while (i < xen_e820_map_entries) {
+               bool discard = false;
+
+               chunk_size = size;
+               type = xen_e820_map[i].type;
 
                if (type == E820_RAM) {
                        if (addr < mem_end) {
-                               size = min(size, mem_end - addr);
+                               chunk_size = min(size, mem_end - addr);
                        } else if (extra_pages) {
-                               size = min(size, PFN_PHYS(extra_pages));
-                               extra_pages -= PFN_DOWN(size);
-                               xen_add_extra_mem(addr, size);
-                               xen_max_p2m_pfn = PFN_DOWN(addr + size);
+                               chunk_size = min(size, PFN_PHYS(extra_pages));
+                               pfn_s = PFN_UP(addr);
+                               n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+                               extra_pages -= n_pfns;
+                               xen_add_extra_mem(pfn_s, n_pfns);
+                               xen_max_p2m_pfn = pfn_s + n_pfns;
                        } else
-                               type = E820_UNUSABLE;
+                               discard = true;
                }
 
-               xen_align_and_add_e820_region(addr, size, type);
+               if (!discard)
+                       xen_align_and_add_e820_region(addr, chunk_size, type);
 
-               map[i].addr += size;
-               map[i].size -= size;
-               if (map[i].size == 0)
+               addr += chunk_size;
+               size -= chunk_size;
+               if (size == 0) {
                        i++;
+                       if (i < xen_e820_map_entries) {
+                               addr = xen_e820_map[i].addr;
+                               size = xen_e820_map[i].size;
+                       }
+               }
        }
 
        /*
         * Set the rest as identity mapped, in case PCI BARs are
         * located here.
-        *
-        * PFNs above MAX_P2M_PFN are considered identity mapped as
-        * well.
         */
-       set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+       set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
 
        /*
         * In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +876,53 @@ char * __init xen_memory_setup(void)
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);
 
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
        /*
-        * Reserve Xen bits:
-        *  - mfn_list
-        *  - xen_start_info
-        * See comment above "struct start_info" in <xen/interface/xen.h>
-        * We tried to make the the memblock_reserve more selective so
-        * that it would be clear what region is reserved. Sadly we ran
-        * in the problem wherein on a 64-bit hypervisor with a 32-bit
-        * initial domain, the pt_base has the cr3 value which is not
-        * neccessarily where the pagetable starts! As Jan put it: "
-        * Actually, the adjustment turns out to be correct: The page
-        * tables for a 32-on-64 dom0 get allocated in the order "first L1",
-        * "first L2", "first L3", so the offset to the page table base is
-        * indeed 2. When reading xen/include/public/xen.h's comment
-        * very strictly, this is not a violation (since there nothing is said
-        * that the first thing in the page table space is pointed to by
-        * pt_base; I admit that this seems to be implied though, namely
-        * do I think that it is implied that the page table space is the
-        * range [pt_base, pt_base + nt_pt_frames), whereas that
-        * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
-        * which - without a priori knowledge - the kernel would have
-        * difficulty to figure out)." - so lets just fall back to the
-        * easy way and reserve the whole region.
+        * Check whether the kernel itself conflicts with the target E820 map.
+        * Failing now is better than running into weird problems later due
+        * to relocating (and even reusing) pages with kernel text or data.
         */
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       if (xen_is_e820_reserved(__pa_symbol(_text),
+                       __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+               xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+               BUG();
+       }
 
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+       /*
+        * Check for a conflict of the hypervisor supplied page tables with
+        * the target E820 map.
+        */
+       xen_pt_check_e820();
+
+       xen_reserve_xen_mfnlist();
+
+       /* Check for a conflict of the initrd with the target E820 map. */
+       if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+                                boot_params.hdr.ramdisk_size)) {
+               phys_addr_t new_area, start, size;
+
+               new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+               if (!new_area) {
+                       xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+                       BUG();
+               }
+
+               start = boot_params.hdr.ramdisk_image;
+               size = boot_params.hdr.ramdisk_size;
+               xen_phys_memcpy(new_area, start, size);
+               pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+                       start, start + size, new_area, new_area + size);
+               memblock_free(start, size);
+               boot_params.hdr.ramdisk_image = new_area;
+               boot_params.ext_ramdisk_image = new_area >> 32;
+       }
+
+       /*
+        * Set identity map on non-RAM pages and prepare remapping the
+        * underlying RAM.
+        */
+       xen_set_identity_and_remap(max_pfn);
 
        return "Xen";
 }
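Taken together, the reworked xen_memory_setup() computes the extra memory budget before touching the p2m: max_pages grows by the pages xen_count_remap_pages() expects to relocate, extra_pages becomes the surplus over the initial allocation, and min3() then clamps it both by EXTRA_MEM_RATIO times the initial allocation and by max_pages - max_pfn, since the p2m cannot manage anything beyond max_pages. A worked example with made-up domU numbers:

#include <stdio.h>

#define EXTRA_MEM_RATIO	10
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* Hypothetical 64-bit domU: 1 GiB initial, 4 GiB maximum, 4 KiB pages. */
	unsigned long max_pfn = 262144;		/* 1 GiB initial allocation   */
	unsigned long max_pages = 1048576;	/* 4 GiB maximum reservation  */
	unsigned long remap_pages = 2048;	/* result of the remap count  */
	unsigned long maxmem_pfn = 1UL << 30;	/* PFN_DOWN(MAXMEM), far away */
	unsigned long extra_pages = 0;

	max_pages += remap_pages;
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/* min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
	 *      extra_pages, max_pages - max_pfn)                  */
	extra_pages = MIN(MIN((unsigned long)EXTRA_MEM_RATIO *
			      MIN(max_pfn, maxmem_pfn), extra_pages),
			  max_pages - max_pfn);

	printf("extra_pages = %lu\n", extra_pages);	/* prints 788480 */
	return 0;
}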
@@ -721,26 +932,30 @@ char * __init xen_memory_setup(void)
  */
 char * __init xen_auto_xlated_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
        struct xen_memory_map memmap;
        int i;
        int rc;
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
        if (rc < 0)
                panic("No memory map (%d)\n", rc);
 
-       sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+       xen_e820_map_entries = memmap.nr_entries;
+
+       sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+                         &xen_e820_map_entries);
 
-       for (i = 0; i < memmap.nr_entries; i++)
-               e820_add_region(map[i].addr, map[i].size, map[i].type);
+       for (i = 0; i < xen_e820_map_entries; i++)
+               e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+                               xen_e820_map[i].type);
 
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       /* Remove p2m info, it is not needed. */
+       xen_start_info->mfn_list = 0;
+       xen_start_info->first_p2m_pfn = 0;
+       xen_start_info->nr_p2m_frames = 0;
 
        return "Xen";
 }
@@ -753,17 +968,8 @@ char * __init xen_auto_xlated_memory_setup(void)
 static void __init fiddle_vdso(void)
 {
 #ifdef CONFIG_X86_32
-       /*
-        * This could be called before selected_vdso32 is initialized, so
-        * just fiddle with both possible images.  vdso_image_32_syscall
-        * can't be selected, since it only exists on 64-bit systems.
-        */
-       u32 *mask;
-       mask = vdso_image_32_int80.data +
-               vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
-       *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
-       mask = vdso_image_32_sysenter.data +
-               vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
+       u32 *mask = vdso_image_32.data +
+               vdso_image_32.sym_VDSO32_NOTE_MASK;
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 #endif
 }