Subject: linux/x86-64: fix issues with the assignment of huge amounts of memory This, together with the Xen side patch to not assign unmanageable amounts of memory to it when it doesn't specify XEN_ELFNOTE_INIT_P2M, makes Dom0 boot without any extra arguments specified on a 1Tb machine. The limit enforced for MAXMEM may not be very exact (i.e. it might be possible to up it further to, say, 0x77ffffffffUL), but the limit is definitely below 512Gb. It doesn't seem very significant, though, to have the exact boundary determined here. Also fix the overlap of the modules area with the fixmaps. Written and tested on our 2.6.16.60-based SLE10 kernel; it applies unmodified to the legacy 2.6.18 one. Signed-off-by: Jan Beulich <jbeulich@novell.com> --- sle10sp3-2009-05-28.orig/arch/x86_64/kernel/head-xen.S 2009-06-05 08:36:00.000000000 +0200 +++ sle10sp3-2009-05-28/arch/x86_64/kernel/head-xen.S 2009-06-04 17:01:57.000000000 +0200 @@ -77,9 +77,6 @@ NEXT_PAGE(level3_kernel_pgt) NEXT_PAGE(level3_user_pgt) .fill 512,8,0 -NEXT_PAGE(level2_kernel_pgt) - .fill 512,8,0 - NEXT_PAGE(hypercall_page) CFI_STARTPROC .rept 0x1000 / 0x20 --- sle10sp3-2009-05-28.orig/arch/x86_64/kernel/setup-xen.c 2009-05-15 15:32:47.000000000 +0200 +++ sle10sp3-2009-05-28/arch/x86_64/kernel/setup-xen.c 2009-06-05 08:39:45.000000000 +0200 @@ -519,10 +519,12 @@ contig_initmem_init(unsigned long start_ panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); #ifdef CONFIG_XEN - e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT); -#else + if (xen_start_info->nr_pages < end_pfn) + e820_bootmem_free(NODE_DATA(0), 0, + xen_start_info->nr_pages<<PAGE_SHIFT); + else +#endif e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); -#endif --- sle10sp3-2009-05-28.orig/arch/x86_64/mm/init-xen.c 2009-06-05 08:36:00.000000000 +0200 +++ sle10sp3-2009-05-28/arch/x86_64/mm/init-xen.c 2009-06-05 08:39:45.000000000 +0200 @@ -500,8 +500,6 @@ void __init xen_init_pt(void) /* Find the initial pte page that was built for us. */ page = (unsigned long *)xen_start_info->pt_base; addr = page[pgd_index(__START_KERNEL_map)]; addr_to_page(addr, page); - addr = page[pud_index(__START_KERNEL_map)]; - addr_to_page(addr, page); #if CONFIG_XEN_COMPAT <= 0x030002 /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER @@ -512,7 +510,9 @@ void __init xen_init_pt(void) /* Mess with the initial mapping of page 0. 
It's not needed. */ BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map); - addr = page[pmd_index(__START_KERNEL_map)]; + addr = page[pud_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + addr = pg[pmd_index(__START_KERNEL_map)]; addr_to_page(addr, pg); pte.pte = pg[pte_index(__START_KERNEL_map)]; BUG_ON(!(pte.pte & _PAGE_PRESENT)); @@ -533,9 +533,10 @@ void __init xen_init_pt(void) /* Construct mapping of initial pte page in our own directories. */ init_level4_pgt[pgd_index(__START_KERNEL_map)] = __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE); - level3_kernel_pgt[pud_index(__START_KERNEL_map)] = - __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE); - memcpy(level2_kernel_pgt, page, PAGE_SIZE); + memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map), + page + pud_index(__START_KERNEL_map), + (PTRS_PER_PUD - pud_index(__START_KERNEL_map)) + * sizeof(*level3_kernel_pgt)); __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] = __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); @@ -548,8 +549,6 @@ void __init xen_init_pt(void) XENFEAT_writable_page_tables); early_make_page_readonly(level3_user_pgt, XENFEAT_writable_page_tables); - early_make_page_readonly(level2_kernel_pgt, - XENFEAT_writable_page_tables); if (!xen_feature(XENFEAT_writable_page_tables)) { xen_pgd_pin(__pa_symbol(init_level4_pgt)); @@ -581,6 +580,23 @@ static void __init extend_init_mapping(u while (va < (__START_KERNEL_map + (start_pfn << PAGE_SHIFT) + tables_space)) { + if (!(pmd_index(va) | pte_index(va))) { + pud_t *pud; + + page = (unsigned long *)init_level4_pgt; + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + pud = (pud_t *)&page[pud_index(va)]; + if (pud_none(*pud)) { + page = alloc_static_page(&phys); + early_make_page_readonly( + page, XENFEAT_writable_page_tables); + set_pud(pud, __pud(phys | _KERNPG_TABLE)); + } else { + addr = page[pud_index(va)]; + addr_to_page(addr, page); + } + } pmd = (pmd_t *)&page[pmd_index(va)]; if (pmd_none(*pmd)) { pte_page = 
alloc_static_page(&phys); @@ -603,6 +619,15 @@ static void __init extend_init_mapping(u /* Finally, blow away any spurious initial mappings. */ while (1) { + if (!(pmd_index(va) | pte_index(va))) { + page = (unsigned long *)init_level4_pgt; + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + if (pud_none(((pud_t *)page)[pud_index(va)])) + break; + addr = page[pud_index(va)]; + addr_to_page(addr, page); + } pmd = (pmd_t *)&page[pmd_index(va)]; if (pmd_none(*pmd)) break; --- sle10sp3-2009-05-28.orig/include/asm-x86_64/mach-xen/asm/pgtable.h 2009-06-05 08:36:00.000000000 +0200 +++ sle10sp3-2009-05-28/include/asm-x86_64/mach-xen/asm/pgtable.h 2009-05-28 13:26:14.000000000 +0200 @@ -137,11 +137,11 @@ static inline void pgd_clear (pgd_t * pg #define FIRST_USER_ADDRESS 0 #ifndef __ASSEMBLY__ -#define MAXMEM 0x3fffffffffffUL +#define MAXMEM 0x6fffffffffUL #define VMALLOC_START 0xffffc20000000000UL #define VMALLOC_END 0xffffe1ffffffffffUL #define MODULES_VADDR 0xffffffff88000000UL -#define MODULES_END 0xfffffffffff00000UL +#define MODULES_END 0xffffffffff000000UL #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define _PAGE_BIT_PRESENT 0