From: jbeulich@xxxxxxxxxx Subject: eliminate scalability issues from initial mapping setup Patch-mainline: obsolete References: bnc#417417 Direct Xen to place the initial P->M table outside of the initial mapping, as otherwise the 1G (implementation) / 2G (theoretical) restriction on the size of the initial mapping limits the amount of memory a domain can be handed initially. --- head-2009-01-06.orig/arch/x86/kernel/head64-xen.c 2008-12-15 11:28:15.000000000 +0100 +++ head-2009-01-06/arch/x86/kernel/head64-xen.c 2008-12-23 12:23:58.000000000 +0100 @@ -171,6 +171,14 @@ void __init x86_64_start_reservations(ch + (xen_start_info->nr_pt_frames << PAGE_SHIFT), "Xen provided"); + if (xen_feature(XENFEAT_auto_translated_physmap)) + xen_start_info->mfn_list = ~0UL; + else if (xen_start_info->mfn_list < __START_KERNEL_map) + reserve_early(xen_start_info->first_p2m_pfn << PAGE_SHIFT, + (xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames) << PAGE_SHIFT, + "INITP2M"); + /* * At this point everything still needed from the boot loader * or BIOS or kernel text should be early reserved or marked not --- head-2009-01-06.orig/arch/x86/kernel/head_64-xen.S 2008-12-15 11:34:16.000000000 +0100 +++ head-2009-01-06/arch/x86/kernel/head_64-xen.S 2008-12-22 12:57:33.000000000 +0100 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,7 @@ ENTRY(empty_zero_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) --- head-2009-01-06.orig/arch/x86/kernel/setup-xen.c 2008-12-23 
09:42:42.000000000 +0100 +++ head-2009-01-06/arch/x86/kernel/setup-xen.c 2008-12-23 15:58:43.000000000 +0100 @@ -1022,7 +1022,7 @@ void __init setup_arch(char **cmdline_p) difference = xen_start_info->nr_pages - max_pfn; set_xen_guest_handle(reservation.extent_start, - ((unsigned long *)xen_start_info->mfn_list) + max_pfn); + phys_to_machine_mapping + max_pfn); reservation.nr_extents = difference; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -1039,14 +1039,86 @@ void __init setup_arch(char **cmdline_p) phys_to_machine_mapping = alloc_bootmem_pages( max_pfn * sizeof(unsigned long)); memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info->mfn_list, + __va(__pa(xen_start_info->mfn_list)), p2m_pages * sizeof(unsigned long)); memset(phys_to_machine_mapping + p2m_pages, ~0, (max_pfn - p2m_pages) * sizeof(unsigned long)); - free_bootmem( - __pa(xen_start_info->mfn_list), - PFN_PHYS(PFN_UP(xen_start_info->nr_pages * - sizeof(unsigned long)))); + +#ifdef CONFIG_X86_64 + if (xen_start_info->mfn_list == VMEMMAP_START) { + /* + * Since it is well isolated we can (and since it is + * perhaps large we should) also free the page tables + * mapping the initial P->M table. 
+ */ + unsigned long va = VMEMMAP_START, pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + + BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK); + xen_l4_entry_update(pgd, __pgd(0)); + for(;;) { + pud_t *pud = pud_page + pud_index(va); + + if (pud_none(*pud)) + va += PUD_SIZE; + else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + make_pages_writable(__va(pa), + PUD_SIZE >> PAGE_SHIFT, + XENFEAT_writable_page_tables); + free_bootmem(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd_t *pmd = pmd_offset(pud, va); + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + make_pages_writable(__va(pa), + PMD_SIZE >> PAGE_SHIFT, + XENFEAT_writable_page_tables); + free_bootmem(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte_t *pte = pte_offset_kernel(pmd, va); + + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + make_page_writable(__va(pa), + XENFEAT_writable_page_tables); + free_bootmem(pa, PAGE_SIZE); + } + ClearPagePinned(virt_to_page(pte)); + make_page_writable(pte, + XENFEAT_writable_page_tables); + free_bootmem(__pa(pte), PAGE_SIZE); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + ClearPagePinned(virt_to_page(pmd)); + make_page_writable(pmd, + XENFEAT_writable_page_tables); + free_bootmem(__pa((unsigned long)pmd + & PAGE_MASK), + PAGE_SIZE); + } + if (!pud_index(va)) + break; + } + ClearPagePinned(virt_to_page(pud_page)); + make_page_writable(pud_page, + XENFEAT_writable_page_tables); + free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK), + PAGE_SIZE); + } else if (!WARN_ON(xen_start_info->mfn_list + < __START_KERNEL_map)) +#endif + free_bootmem(__pa(xen_start_info->mfn_list), + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * + sizeof(unsigned long)))); + /* * Initialise the list of the frames that specify the list of --- head-2009-01-06.orig/arch/x86/mm/init_64-xen.c 2009-01-07 10:57:55.000000000 +0100 +++ 
head-2009-01-06/arch/x86/mm/init_64-xen.c 2009-01-07 10:58:07.000000000 +0100 @@ -157,6 +157,17 @@ static unsigned long __meminitdata table static unsigned long __meminitdata table_cur; static unsigned long __meminitdata table_top; +static __init unsigned long get_table_cur(void) +{ + BUG_ON(!table_cur); + if (xen_start_info->mfn_list < __START_KERNEL_map + && table_cur == xen_start_info->first_p2m_pfn) { + table_cur += xen_start_info->nr_p2m_frames; + table_top += xen_start_info->nr_p2m_frames; + } + return table_cur++; +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -168,8 +179,7 @@ static __ref void *spp_getpage(void) if (after_bootmem) ptr = (void *) get_zeroed_page(GFP_ATOMIC); else if (table_cur < table_top) { - ptr = __va(table_cur << PAGE_SHIFT); - table_cur++; + ptr = __va(get_table_cur() << PAGE_SHIFT); memset(ptr, 0, PAGE_SIZE); } else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -334,8 +344,7 @@ static __ref void *alloc_low_page(unsign return adr; } - BUG_ON(!table_cur); - pfn = table_cur++; + pfn = get_table_cur(); if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); @@ -361,14 +370,29 @@ static inline int __meminit make_readonl /* Make new page tables read-only on the first pass. */ if (!xen_feature(XENFEAT_writable_page_tables) && !max_pfn_mapped - && (paddr >= (table_start << PAGE_SHIFT)) - && (paddr < (table_top << PAGE_SHIFT))) - readonly = 1; + && (paddr >= (table_start << PAGE_SHIFT))) { + unsigned long top = table_top; + + /* Account for the range get_table_cur() skips. */ + if (xen_start_info->mfn_list < __START_KERNEL_map + && table_cur <= xen_start_info->first_p2m_pfn + && top > xen_start_info->first_p2m_pfn) + top += xen_start_info->nr_p2m_frames; + if (paddr < (top << PAGE_SHIFT)) + readonly = 1; + } /* Make old page tables read-only. 
*/ if (!xen_feature(XENFEAT_writable_page_tables) && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) && (paddr < (table_cur << PAGE_SHIFT))) readonly = 1; + /* Make P->M table (and its page tables) read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && xen_start_info->mfn_list < __START_KERNEL_map + && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT) + && paddr < (xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames) << PAGE_SHIFT) + readonly = 1; /* * No need for writable mapping of kernel image. This also ensures that @@ -616,6 +640,12 @@ void __init xen_init_pt(void) __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE); memcpy(level2_kernel_pgt, page, PAGE_SIZE); + /* Copy the initial P->M table mappings if necessary. */ + addr = pgd_index(xen_start_info->mfn_list); + if (addr < pgd_index(__START_KERNEL_map)) + init_level4_pgt[addr] = + ((pgd_t *)xen_start_info->pt_base)[addr]; + /* Do an early initialization of the fixmap area. */ addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); level3_kernel_pgt[pud_index(addr)] = @@ -675,22 +705,28 @@ static void __init find_early_table_spac static void __init xen_finish_init_mapping(void) { unsigned long i, start, end; + struct mmuext_op mmuext; /* Re-vector virtual addresses pointing into the initial mapping to the just-established permanent ones. */ xen_start_info = __va(__pa(xen_start_info)); xen_start_info->pt_base = (unsigned long) __va(__pa(xen_start_info->pt_base)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!xen_feature(XENFEAT_auto_translated_physmap) + && xen_start_info->mfn_list >= __START_KERNEL_map) phys_to_machine_mapping = __va(__pa(xen_start_info->mfn_list)); - xen_start_info->mfn_list = (unsigned long) - phys_to_machine_mapping; - } if (xen_start_info->mod_start) xen_start_info->mod_start = (unsigned long) __va(__pa(xen_start_info->mod_start)); + /* Unpin the no longer used Xen provided page tables. 
*/ + mmuext.cmd = MMUEXT_UNPIN_TABLE; + mmuext.arg1.mfn = pfn_to_mfn(__pa(xen_start_info->pt_base) + >> PAGE_SHIFT); + if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF)) + BUG(); + /* Destroy the Xen-created mappings beyond the kernel image. */ start = PAGE_ALIGN((unsigned long)_end); end = __START_KERNEL_map + (table_start << PAGE_SHIFT); @@ -934,9 +970,20 @@ unsigned long __init_refok init_memory_m __flush_tlb_all(); - if (!after_bootmem && table_top > table_start) + if (!after_bootmem && table_top > table_start) { + if (xen_start_info->mfn_list < __START_KERNEL_map + && table_start <= xen_start_info->first_p2m_pfn + && table_top > xen_start_info->first_p2m_pfn) { + reserve_early(table_start << PAGE_SHIFT, + xen_start_info->first_p2m_pfn + << PAGE_SHIFT, + "PGTABLE"); + table_start = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + } reserve_early(table_start << PAGE_SHIFT, table_top << PAGE_SHIFT, "PGTABLE"); + } printk(KERN_INFO "last_map_addr: %lx end: %lx\n", last_map_addr, end); --- head-2009-01-06.orig/arch/x86/mm/pgtable-xen.c 2008-12-15 11:34:16.000000000 +0100 +++ head-2009-01-06/arch/x86/mm/pgtable-xen.c 2008-12-23 14:38:22.000000000 +0100 @@ -323,7 +323,7 @@ void __init xen_init_pgd_pin(void) if (PTRS_PER_PUD > 1) /* not folded */ SetPagePinned(virt_to_page(pud)); for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (!pud_present(*pud)) + if (!pud_present(*pud) || pud_large(*pud)) continue; pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ @@ -334,7 +334,7 @@ void __init xen_init_pgd_pin(void) && m >= pmd_index(HYPERVISOR_VIRT_START)) continue; #endif - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || pmd_large(*pmd)) continue; SetPagePinned(pmd_page(*pmd)); } --- head-2009-01-06.orig/include/xen/interface/elfnote.h 2008-11-25 12:35:56.000000000 +0100 +++ head-2009-01-06/include/xen/interface/elfnote.h 2008-12-22 12:53:19.000000000 +0100 @@ -162,9 +162,20 @@ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 /* + * The 
(non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be prepared for the possibility that the page table + * pages used for this mapping are not accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* * The number of the highest elfnote defined. */ -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M /* * System information exported through crash notes. --- head-2009-01-06.orig/include/xen/interface/xen.h 2008-12-15 11:27:38.000000000 +0100 +++ head-2009-01-06/include/xen/interface/xen.h 2008-12-23 10:50:00.000000000 +0100 @@ -534,6 +534,7 @@ typedef struct shared_info shared_info_t * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] * c. list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] * e. bootstrap page tables [pt_base, CR3 (x86)] * f. bootstrap stack [register ESP (x86)] @@ -575,6 +576,9 @@ struct start_info { unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; typedef struct start_info start_info_t;