diff -r 0f36c2eec2e1 xen/drivers/passthrough/amd/iommu_map.c
--- a/xen/drivers/passthrough/amd/iommu_map.c	Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/drivers/passthrough/amd/iommu_map.c	Fri Aug 12 14:03:29 2011 +0200
@@ -168,98 +168,59 @@ void flush_command_buffer(struct amd_iom
         AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n");
 }
 
-static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn)
-{
-    u32 *l1e;
-    int offset;
-    void *l1_table;
-
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
-
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
-
-    /* clear l1 entry */
-    l1e[0] = l1e[1] = 0;
-
-    unmap_domain_page(l1_table);
-}
-
-static int set_iommu_l1e_present(u64 l2e, unsigned long gfn,
-                                 u64 maddr, int iw, int ir)
-{
-    u64 addr_lo, addr_hi, maddr_old;
+/* Given pfn and page table level, return pde index */
+static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
+{
+    unsigned int idx;
+
+    idx = pfn >> (PTE_PER_TABLE_SHIFT * (--level));
+    idx &= ~PTE_PER_TABLE_MASK;
+    return idx;
+}
+
+void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
+{
+    u64 *table, *pte;
+
+    table = map_domain_page(l1_mfn);
+    pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1);
+    *pte = 0;
+    unmap_domain_page(table);
+}
+
+static bool_t set_iommu_pde_present(u32 *pde, unsigned long next_mfn,
+                                    unsigned int next_level,
+                                    bool_t iw, bool_t ir)
+{
+    u64 addr_lo, addr_hi, maddr_old, maddr_next;
     u32 entry;
-    void *l1_table;
-    int offset;
-    u32 *l1e;
-    int need_flush = 0;
-
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
-
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
-
-    addr_hi = get_field_from_reg_u32(l1e[1],
+    bool_t need_flush = 0;
+
+    maddr_next = (u64)next_mfn << PAGE_SHIFT;
+
+    addr_hi = get_field_from_reg_u32(pde[1],
                                      IOMMU_PTE_ADDR_HIGH_MASK,
                                      IOMMU_PTE_ADDR_HIGH_SHIFT);
-    addr_lo = get_field_from_reg_u32(l1e[0],
+    addr_lo = get_field_from_reg_u32(pde[0],
                                      IOMMU_PTE_ADDR_LOW_MASK,
                                      IOMMU_PTE_ADDR_LOW_SHIFT);
 
-    maddr_old = ((addr_hi << 32) | addr_lo) << PAGE_SHIFT;
-
-    if ( maddr_old && (maddr_old != maddr) )
+    maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
+
+    if ( maddr_old != maddr_next )
        need_flush = 1;
 
-    addr_lo = maddr & DMA_32BIT_MASK;
-    addr_hi = maddr >> 32;
-
-    set_field_in_reg_u32((u32)addr_hi, 0,
-                         IOMMU_PTE_ADDR_HIGH_MASK,
-                         IOMMU_PTE_ADDR_HIGH_SHIFT, &entry);
-    set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_DISABLED, entry,
-                         IOMMU_PTE_IO_WRITE_PERMISSION_MASK,
-                         IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT, &entry);
-    set_field_in_reg_u32(ir ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_DISABLED, entry,
-                         IOMMU_PTE_IO_READ_PERMISSION_MASK,
-                         IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry);
-    l1e[1] = entry;
-
-    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
-                         IOMMU_PTE_ADDR_LOW_MASK,
-                         IOMMU_PTE_ADDR_LOW_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_PAGING_MODE_LEVEL_0, entry,
-                         IOMMU_PTE_NEXT_LEVEL_MASK,
-                         IOMMU_PTE_NEXT_LEVEL_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
-                         IOMMU_PTE_PRESENT_MASK,
-                         IOMMU_PTE_PRESENT_SHIFT, &entry);
-    l1e[0] = entry;
-
-    unmap_domain_page(l1_table);
-    return need_flush;
-}
-
-static void amd_iommu_set_page_directory_entry(u32 *pde,
-                                               u64 next_ptr, u8 next_level)
-{
-    u64 addr_lo, addr_hi;
-    u32 entry;
-
-    addr_lo = next_ptr & DMA_32BIT_MASK;
-    addr_hi = next_ptr >> 32;
+    addr_lo = maddr_next & DMA_32BIT_MASK;
+    addr_hi = maddr_next >> 32;
 
     /* enable read/write permissions,which will be enforced at the PTE */
     set_field_in_reg_u32((u32)addr_hi, 0,
                          IOMMU_PDE_ADDR_HIGH_MASK,
                          IOMMU_PDE_ADDR_HIGH_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+    set_field_in_reg_u32(iw, entry,
                          IOMMU_PDE_IO_WRITE_PERMISSION_MASK,
                          IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+    set_field_in_reg_u32(ir, entry,
                          IOMMU_PDE_IO_READ_PERMISSION_MASK,
                          IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry);
     pde[1] = entry;
@@ -275,6 +236,26 @@ static void amd_iommu_set_page_directory
                          IOMMU_PDE_PRESENT_MASK,
                          IOMMU_PDE_PRESENT_SHIFT, &entry);
     pde[0] = entry;
+
+    return need_flush;
+}
+
+static bool_t set_iommu_pte_present(unsigned long pt_mfn, unsigned long gfn,
+                                    unsigned long next_mfn, int pde_level,
+                                    bool_t iw, bool_t ir)
+{
+    u64 *table;
+    u32 *pde;
+    bool_t need_flush = 0;
+
+    table = map_domain_page(pt_mfn);
+
+    pde = (u32*)(table + pfn_to_pde_idx(gfn, pde_level));
+
+    need_flush = set_iommu_pde_present(pde, next_mfn,
+                                       IOMMU_PAGING_MODE_LEVEL_0, iw, ir);
+    unmap_domain_page(table);
+    return need_flush;
 }
 
 void amd_iommu_set_root_page_table(
@@ -413,11 +394,18 @@ u64 amd_iommu_get_next_table_from_pte(u3
     return ptr;
 }
 
+static unsigned int iommu_next_level(u32 *entry)
+{
+    return get_field_from_reg_u32(entry[0],
+                                  IOMMU_PDE_NEXT_LEVEL_MASK,
+                                  IOMMU_PDE_NEXT_LEVEL_SHIFT);
+}
+
 static int amd_iommu_is_pte_present(u32 *entry)
 {
-    return (get_field_from_reg_u32(entry[0],
-           IOMMU_PDE_PRESENT_MASK,
-           IOMMU_PDE_PRESENT_SHIFT));
+    return get_field_from_reg_u32(entry[0],
+                                  IOMMU_PDE_PRESENT_MASK,
+                                  IOMMU_PDE_PRESENT_SHIFT);
 }
 
 void invalidate_dev_table_entry(struct amd_iommu *iommu,
@@ -439,54 +427,241 @@ void invalidate_dev_table_entry(struct a
     send_iommu_command(iommu, cmd);
 }
 
-static u64 iommu_l2e_from_pfn(struct page_info *table, int level,
-                              unsigned long io_pfn)
-{
-    unsigned long offset;
-    void *pde = NULL;
-    void *table_vaddr;
-    u64 next_table_maddr = 0;
-    unsigned int lowest = 1;
-
-    BUG_ON( table == NULL || level < lowest );
-
-    if ( level == lowest )
-        return page_to_maddr(table);
-
-    while ( level > lowest )
-    {
-        offset = io_pfn >> ((PTE_PER_TABLE_SHIFT *
-                             (level - IOMMU_PAGING_MODE_LEVEL_1)));
-        offset &= ~PTE_PER_TABLE_MASK;
-
-        table_vaddr = __map_domain_page(table);
-        pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
-        next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
-
-        if ( !amd_iommu_is_pte_present(pde) )
+/* For each pde, We use ignored bits (bit 1 - bit 8 and bit 63)
+ * to save pde count, pde count = 511 is a candidate of page coalescing.
+ */
+static unsigned int get_pde_count(u64 pde)
+{
+    unsigned int count;
+    u64 upper_mask = 1ULL << 63 ;
+    u64 lower_mask = 0xFF << 1;
+
+    count = ((pde & upper_mask) >> 55) | ((pde & lower_mask) >> 1);
+    return count;
+}
+
+/* Convert pde count into iommu pte ignored bits */
+static void set_pde_count(u64 *pde, unsigned int count)
+{
+    u64 upper_mask = 1ULL << 8 ;
+    u64 lower_mask = 0xFF;
+    u64 pte_mask = (~(1ULL << 63)) & (~(0xFF << 1));
+
+    *pde &= pte_mask;
+    *pde |= ((count & upper_mask ) << 55) | ((count & lower_mask ) << 1);
+}
+
+/* Return 1, if pages are suitable for merging at merge_level.
+ * otherwise increase pde count if mfn is contiguous with mfn - 1
+ */
+static int iommu_update_pde_count(struct domain *d, unsigned long pt_mfn,
+                                  unsigned long gfn, unsigned long mfn,
+                                  unsigned int merge_level)
+{
+    unsigned int pde_count, next_level;
+    unsigned long first_mfn;
+    u64 *table, *pde, *ntable;
+    u64 ntable_maddr, mask;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    bool_t ok = 0;
+
+    ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn );
+
+    next_level = merge_level - 1;
+
+    /* get pde at merge level */
+    table = map_domain_page(pt_mfn);
+    pde = table + pfn_to_pde_idx(gfn, merge_level);
+
+    /* get page table of next level */
+    ntable_maddr = amd_iommu_get_next_table_from_pte((u32*)pde);
+    ntable = map_domain_page(ntable_maddr >> PAGE_SHIFT);
+
+    /* get the first mfn of next level */
+    first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
+
+    if ( first_mfn == 0 )
+        goto out;
+
+    mask = (1ULL<< (PTE_PER_TABLE_SHIFT * next_level)) - 1;
+
+    if ( ((first_mfn & mask) == 0) &&
+         (((gfn & mask) | first_mfn) == mfn) )
+    {
+        pde_count = get_pde_count(*pde);
+
+        if ( pde_count == (PTE_PER_TABLE_SIZE - 1) )
+            ok = 1;
+        else if ( pde_count < (PTE_PER_TABLE_SIZE - 1))
         {
-            if ( next_table_maddr == 0 )
+            pde_count++;
+            set_pde_count(pde, pde_count);
+        }
+    }
+
+    else
+        /* non-contiguous mapping */
+        set_pde_count(pde, 0);
+
+out:
+    unmap_domain_page(ntable);
+    unmap_domain_page(table);
+
+    return ok;
+}
+
+static int iommu_merge_pages(struct domain *d, unsigned long pt_mfn,
+                             unsigned long gfn, unsigned int flags,
+                             unsigned int merge_level)
+{
+    u64 *table, *pde, *ntable;
+    u64 ntable_mfn;
+    unsigned long first_mfn;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn );
+
+    table = map_domain_page(pt_mfn);
+    pde = table + pfn_to_pde_idx(gfn, merge_level);
+
+    /* get first mfn */
+    ntable_mfn = amd_iommu_get_next_table_from_pte((u32*)pde) >> PAGE_SHIFT;
+
+    if ( ntable_mfn == 0 )
+    {
+        unmap_domain_page(table);
+        return 1;
+    }
+
+    ntable = map_domain_page(ntable_mfn);
+    first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
+
+    if ( first_mfn == 0 )
+    {
+        unmap_domain_page(ntable);
+        unmap_domain_page(table);
+        return 1;
+    }
+
+    /* setup super page mapping, next level = 0 */
+    set_iommu_pde_present((u32*)pde, first_mfn,
+                          IOMMU_PAGING_MODE_LEVEL_0,
+                          !!(flags & IOMMUF_writable),
+                          !!(flags & IOMMUF_readable));
+
+    amd_iommu_flush_all_pages(d);
+
+    unmap_domain_page(ntable);
+    unmap_domain_page(table);
+    return 0;
+}
+
+/* Walk io page tables and build level page tables if necessary
+ * {Re, un}mapping super page frames causes re-allocation of io
+ * page tables.
+ */
+static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn,
+                              unsigned long pt_mfn[])
+{
+    u64 *pde, *next_table_vaddr;
+    unsigned long next_table_mfn;
+    unsigned int level;
+    struct page_info *table;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    table = hd->root_table;
+    level = hd->paging_mode;
+
+    BUG_ON( table == NULL || level < IOMMU_PAGING_MODE_LEVEL_1 ||
+            level > IOMMU_PAGING_MODE_LEVEL_6 );
+
+    next_table_mfn = page_to_mfn(table);
+
+    if ( level == IOMMU_PAGING_MODE_LEVEL_1 )
+    {
+        pt_mfn[level] = next_table_mfn;
+        return 0;
+    }
+
+    while ( level > IOMMU_PAGING_MODE_LEVEL_1 )
+    {
+        unsigned int next_level = level - 1;
+        pt_mfn[level] = next_table_mfn;
+
+        next_table_vaddr = map_domain_page(next_table_mfn);
+        pde = next_table_vaddr + pfn_to_pde_idx(pfn, level);
+
+        /* Here might be a super page frame */
+        next_table_mfn = amd_iommu_get_next_table_from_pte((uint32_t*)pde)
+                         >> PAGE_SHIFT;
+
+        /* Split super page frame into smaller pieces.*/
+        if ( amd_iommu_is_pte_present((u32*)pde) &&
+             (iommu_next_level((u32*)pde) == 0) &&
+             next_table_mfn != 0 )
+        {
+            int i;
+            unsigned long mfn, gfn;
+            unsigned int page_sz;
+
+            page_sz = 1 << (PTE_PER_TABLE_SHIFT * (next_level - 1));
+            gfn = pfn & ~((1 << (PTE_PER_TABLE_SHIFT * next_level)) - 1);
+            mfn = next_table_mfn;
+
+            /* allocate lower level page table */
+            table = alloc_amd_iommu_pgtable();
+            if ( table == NULL )
+            {
+                AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
+                unmap_domain_page(next_table_vaddr);
+                return 1;
+            }
+
+            next_table_mfn = page_to_mfn(table);
+            set_iommu_pde_present((u32*)pde, next_table_mfn, next_level,
+                                  !!IOMMUF_writable, !!IOMMUF_readable);
+
+            for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
+            {
+                set_iommu_pte_present(next_table_mfn, gfn, mfn, next_level,
+                                      !!IOMMUF_writable, !!IOMMUF_readable);
+                mfn += page_sz;
+                gfn += page_sz;
+            }
+
+            amd_iommu_flush_all_pages(d);
+        }
+
+        /* Install lower level page table for non-present entries */
+        else if ( !amd_iommu_is_pte_present((u32*)pde) )
+        {
+            if ( next_table_mfn == 0 )
             {
                 table = alloc_amd_iommu_pgtable();
                 if ( table == NULL )
                 {
-                    printk("AMD-Vi: Cannot allocate I/O page table\n");
-                    return 0;
+                    AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
+                    unmap_domain_page(next_table_vaddr);
+                    return 1;
                 }
-                next_table_maddr = page_to_maddr(table);
-                amd_iommu_set_page_directory_entry(
-                    (u32 *)pde, next_table_maddr, level - 1);
+                next_table_mfn = page_to_mfn(table);
+                set_iommu_pde_present((u32*)pde, next_table_mfn, next_level,
+                                      !!IOMMUF_writable, !!IOMMUF_readable);
             }
             else /* should never reach here */
-                return 0;
+            {
+                unmap_domain_page(next_table_vaddr);
+                return 1;
+            }
         }
 
-        unmap_domain_page(table_vaddr);
-        table = maddr_to_page(next_table_maddr);
+        unmap_domain_page(next_table_vaddr);
         level--;
     }
 
-    return next_table_maddr;
+    /* mfn of level 1 page table */
+    pt_mfn[level] = next_table_mfn;
+    return 0;
 }
 
 static int update_paging_mode(struct domain *d, unsigned long gfn)
@@ -500,7 +675,7 @@ static int update_paging_mode(struct dom
     struct page_info *new_root = NULL;
     struct page_info *old_root = NULL;
     void *new_root_vaddr;
-    u64 old_root_maddr;
+    unsigned long old_root_mfn;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
     level = hd->paging_mode;
@@ -522,12 +697,13 @@ static int update_paging_mode(struct dom
         }
 
         new_root_vaddr = __map_domain_page(new_root);
-        old_root_maddr = page_to_maddr(old_root);
-        amd_iommu_set_page_directory_entry((u32 *)new_root_vaddr,
-                                           old_root_maddr, level);
+        old_root_mfn = page_to_mfn(old_root);
+        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
+                              !!IOMMUF_writable, !!IOMMUF_readable);
         level++;
         old_root = new_root;
         offset >>= PTE_PER_TABLE_SHIFT;
+        unmap_domain_page(new_root_vaddr);
     }
 
     if ( new_root != NULL )
@@ -575,14 +751,17 @@ int amd_iommu_map_page(struct domain *d,
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
-    u64 iommu_l2e;
-    int need_flush = 0;
+    bool_t need_flush = 0;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
+    unsigned long pt_mfn[7];
+    unsigned int merge_level;
 
     BUG_ON( !hd->root_table );
 
     if ( iommu_hap_pt_share && is_hvm_domain(d) )
         return 0;
+
+    memset(pt_mfn, 0, sizeof(pt_mfn));
 
     spin_lock(&hd->mapping_lock);
 
@@ -592,14 +771,14 @@ int amd_iommu_map_page(struct domain *d,
     {
         if ( update_paging_mode(d, gfn) )
         {
+            spin_unlock(&hd->mapping_lock);
             AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
             domain_crash(d);
             return -EFAULT;
         }
     }
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-    if ( iommu_l2e == 0 )
+    if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->mapping_lock);
         AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
@@ -607,25 +786,56 @@ int amd_iommu_map_page(struct domain *d,
         return -EFAULT;
     }
 
-    need_flush = set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT,
+    /* Install 4k mapping first */
+    need_flush = set_iommu_pte_present(pt_mfn[1], gfn, mfn,
+                                       IOMMU_PAGING_MODE_LEVEL_1,
                                        !!(flags & IOMMUF_writable),
                                        !!(flags & IOMMUF_readable));
-    if ( need_flush )
-        amd_iommu_flush_pages(d, gfn, 0);
-
+
+    /* Do not increase pde count if io mapping has not been changed */
+    if ( !need_flush )
+        goto out;
+
+    amd_iommu_flush_pages(d, gfn, 0);
+
+    for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
+          merge_level <= hd->paging_mode; merge_level++ )
+    {
+        if ( pt_mfn[merge_level] == 0 )
+            break;
+        if ( !iommu_update_pde_count(d, pt_mfn[merge_level],
+                                     gfn, mfn, merge_level) )
+            break;
+        /* Deallocate lower level page table */
+        free_amd_iommu_pgtable(mfn_to_page(pt_mfn[merge_level - 1]));
+
+        if ( iommu_merge_pages(d, pt_mfn[merge_level], gfn,
+                               flags, merge_level) )
+        {
+            spin_unlock(&hd->mapping_lock);
+            AMD_IOMMU_DEBUG("Merge iommu page failed at level %d, "
+                            "gfn = %lx mfn = %lx\n", merge_level, gfn, mfn);
+            domain_crash(d);
+            return -EFAULT;
+        }
+    }
+
+out:
     spin_unlock(&hd->mapping_lock);
     return 0;
 }
 
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
 {
-    u64 iommu_l2e;
+    unsigned long pt_mfn[7];
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
     BUG_ON( !hd->root_table );
 
     if ( iommu_hap_pt_share && is_hvm_domain(d) )
         return 0;
+
+    memset(pt_mfn, 0, sizeof(pt_mfn));
 
     spin_lock(&hd->mapping_lock);
 
@@ -635,15 +845,14 @@ int amd_iommu_unmap_page(struct domain *
     {
         if ( update_paging_mode(d, gfn) )
        {
+            spin_unlock(&hd->mapping_lock);
             AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
             domain_crash(d);
             return -EFAULT;
         }
     }
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-
-    if ( iommu_l2e == 0 )
+    if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->mapping_lock);
         AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
@@ -652,7 +861,7 @@ int amd_iommu_unmap_page(struct domain *
     }
 
     /* mark PTE as 'page not present' */
-    clear_iommu_l1e_present(iommu_l2e, gfn);
+    clear_iommu_pte_present(pt_mfn[1], gfn);
     spin_unlock(&hd->mapping_lock);
 
     amd_iommu_flush_pages(d, gfn, 0);
diff -r 0f36c2eec2e1 xen/drivers/passthrough/amd/pci_amd_iommu.c
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c	Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c	Fri Aug 12 14:03:29 2011 +0200
@@ -237,7 +237,9 @@ static void __init amd_iommu_dom0_init(s
          * XXX Should we really map all non-RAM (above 4G)? Minimally
          * a pfn_valid() check would seem desirable here.
          */
-        amd_iommu_map_page(d, pfn, pfn, IOMMUF_readable|IOMMUF_writable);
+        if ( mfn_valid(pfn) )
+            amd_iommu_map_page(d, pfn, pfn,
+                               IOMMUF_readable|IOMMUF_writable);
     }
 }
 
@@ -333,7 +335,8 @@ static void deallocate_next_page_table(s
 {
     void *table_vaddr, *pde;
     u64 next_table_maddr;
-    int index;
+    int index, next_level, present;
+    u32 *entry;
 
     table_vaddr = __map_domain_page(pg);
 
@@ -343,7 +346,18 @@ static void deallocate_next_page_table(s
     {
         pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
         next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
-        if ( next_table_maddr != 0 )
+        entry = (u32*)pde;
+
+        next_level = get_field_from_reg_u32(entry[0],
+                                            IOMMU_PDE_NEXT_LEVEL_MASK,
+                                            IOMMU_PDE_NEXT_LEVEL_SHIFT);
+
+        present = get_field_from_reg_u32(entry[0],
+                                         IOMMU_PDE_PRESENT_MASK,
+                                         IOMMU_PDE_PRESENT_SHIFT);
+
+        if ( (next_table_maddr != 0) && (next_level != 0)
+             && present )
         {
             deallocate_next_page_table(
                 maddr_to_page(next_table_maddr), level - 1);
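
The standalone sketches below illustrate the mechanisms the patch introduces. They assume the usual Xen AMD IOMMU constants (PTE_PER_TABLE_SHIFT == 9, so 512 entries per table, with PTE_PER_TABLE_MASK == ~(PTE_PER_TABLE_SIZE - 1)); the values and helper names in the examples are invented for illustration and are not part of the patch.

pfn_to_pde_idx() selects the nine pfn bits that index the page table at a given level. A minimal sketch with the mask written out explicitly instead of via ~PTE_PER_TABLE_MASK:

/* Standalone illustration of the pfn_to_pde_idx() helper; assumed constants,
 * not part of the patch. */
#include <stdio.h>

#define PTE_PER_TABLE_SHIFT 9
#define PTE_PER_TABLE_SIZE  (1 << PTE_PER_TABLE_SHIFT)

/* Index of the entry covering 'pfn' in a page table at 'level' (1 = leaf). */
static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
{
    return (pfn >> (PTE_PER_TABLE_SHIFT * (level - 1))) &
           (PTE_PER_TABLE_SIZE - 1);
}

int main(void)
{
    unsigned long pfn = 0x12345;
    unsigned int level;

    /* 0x12345: level 1 index 0x145, level 2 index 0x91, level 3 index 0x0 */
    for ( level = 1; level <= 3; level++ )
        printf("level %u -> index 0x%x\n", level, pfn_to_pde_idx(pfn, level));
    return 0;
}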
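
get_pde_count()/set_pde_count() keep a 9-bit counter (0..511) in bits the IOMMU ignores: PDE bits 1-8 hold the low byte and bit 63 holds the ninth bit. When the counter has reached 511 and one more contiguous mapping arrives, the level above becomes a merge candidate. A standalone round-trip check of that encoding (it mirrors the patch helpers, but the PDE value is invented):

/* Round-trip check of the pde-count encoding in the ignored PDE bits;
 * illustration only. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int get_pde_count(uint64_t pde)
{
    return ((pde & (1ULL << 63)) >> 55) | ((pde & (0xFFULL << 1)) >> 1);
}

static void set_pde_count(uint64_t *pde, unsigned int count)
{
    *pde &= ~(1ULL << 63) & ~(0xFFULL << 1);     /* clear the ignored bits */
    *pde |= ((uint64_t)(count & 0x100) << 55) |  /* count bit 8 -> pde bit 63 */
            ((uint64_t)(count & 0x0FF) << 1);    /* count bits 0-7 -> bits 1-8 */
}

int main(void)
{
    uint64_t pde = 0x8000000012345627ULL;   /* arbitrary PDE contents */
    unsigned int c;

    for ( c = 0; c <= 511; c++ )
    {
        set_pde_count(&pde, c);
        assert(get_pde_count(pde) == c);    /* the 9-bit value round-trips */
    }
    printf("count round-trips for 0..511; 511 marks a merge candidate\n");
    return 0;
}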
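
iommu_update_pde_count() decides contiguity with mask = (1ULL << (PTE_PER_TABLE_SHIFT * next_level)) - 1: first_mfn (the frame behind entry 0 of the next-level table) must be superframe-aligned, and the mfn being mapped must sit at the same offset inside the superframe as the gfn does. A worked example with invented numbers, taking merge_level 2 (a 2MB region of 4KB pages):

/* Worked example of the merge-eligibility test; values are invented. */
#include <stdint.h>
#include <stdio.h>

#define PTE_PER_TABLE_SHIFT 9

static int mergeable(unsigned long gfn, unsigned long mfn,
                     unsigned long first_mfn, unsigned int merge_level)
{
    unsigned int next_level = merge_level - 1;
    uint64_t mask = (1ULL << (PTE_PER_TABLE_SHIFT * next_level)) - 1;

    /* first_mfn must itself be superframe-aligned, and the new mfn must sit
     * at the same offset inside the superframe as gfn does. */
    return ((first_mfn & mask) == 0) && (((gfn & mask) | first_mfn) == mfn);
}

int main(void)
{
    unsigned long first_mfn = 0x40000;          /* mfn behind PTE index 0 */

    /* gfn 0x10123 is at offset 0x123 in its 2MB frame; a contiguous
     * mapping must therefore use mfn 0x40123. */
    printf("mfn 0x40123: %d\n", mergeable(0x10123, 0x40123, first_mfn, 2));
    printf("mfn 0x50123: %d\n", mergeable(0x10123, 0x50123, first_mfn, 2));
    return 0;
}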
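
When iommu_pde_from_gfn() finds a present PDE above level 1 whose next-level field is 0, it splits the superpage: a fresh lower table is allocated and refilled with PTE_PER_TABLE_SIZE smaller mappings, where page_sz is the number of 4KB frames each new entry covers and gfn is rounded down to the superframe base. The arithmetic, sketched standalone with invented values for a 2MB split:

/* Sketch of the split arithmetic for a 2MB superpage; illustration only. */
#include <stdio.h>

#define PTE_PER_TABLE_SHIFT 9
#define PTE_PER_TABLE_SIZE  (1 << PTE_PER_TABLE_SHIFT)

int main(void)
{
    unsigned int next_level = 1;                 /* splitting into 4KB PTEs */
    unsigned long pfn = 0x10123;                 /* pfn being (re)mapped */
    unsigned long superpage_mfn = 0x40000;       /* old PDE pointed here */

    unsigned int page_sz = 1 << (PTE_PER_TABLE_SHIFT * (next_level - 1));
    unsigned long gfn = pfn & ~((1UL << (PTE_PER_TABLE_SHIFT * next_level)) - 1);
    unsigned long mfn = superpage_mfn;
    unsigned int i;

    /* 512 contiguous gfn->mfn pairs re-create the old superpage mapping */
    for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
    {
        if ( i < 2 || i == PTE_PER_TABLE_SIZE - 1 )  /* print a sample */
            printf("entry %3u: gfn 0x%lx -> mfn 0x%lx\n", i, gfn, mfn);
        mfn += page_sz;
        gfn += page_sz;
    }
    return 0;
}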