diff -NruabBEp linux-2.6.34.5/arch/x86/include/asm/xen/hypercall.h l-t/arch/x86/include/asm/xen/hypercall.h --- linux-2.6.34.5/arch/x86/include/asm/xen/hypercall.h 2010-08-20 22:52:05.000000000 +0400 +++ l-t/arch/x86/include/asm/xen/hypercall.h 2010-08-24 12:36:08.000000000 +0400 @@ -45,6 +45,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: @@ -417,6 +418,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsi return _hypercall2(int, nmi_op, op, arg); } +static inline int +HYPERVISOR_tmem_op(struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff -NruabBEp linux-2.6.34.5/Documentation/ABI/testing/sysfs-kernel-mm-cleancache l-t/Documentation/ABI/testing/sysfs-kernel-mm-cleancache --- linux-2.6.34.5/Documentation/ABI/testing/sysfs-kernel-mm-cleancache 1970-01-01 03:00:00.000000000 +0300 +++ l-t/Documentation/ABI/testing/sysfs-kernel-mm-cleancache 2010-08-24 12:35:44.000000000 +0400 @@ -0,0 +1,11 @@ +What: /sys/kernel/mm/cleancache/ +Date: June 2010 +Contact: Dan Magenheimer +Description: + /sys/kernel/mm/cleancache/ contains a number of files which + record a count of various cleancache operations + (sum across all filesystems): + succ_gets + failed_gets + puts + flushes diff -NruabBEp linux-2.6.34.5/Documentation/ABI/testing/sysfs-kernel-mm-frontswap l-t/Documentation/ABI/testing/sysfs-kernel-mm-frontswap --- linux-2.6.34.5/Documentation/ABI/testing/sysfs-kernel-mm-frontswap 1970-01-01 03:00:00.000000000 +0300 +++ l-t/Documentation/ABI/testing/sysfs-kernel-mm-frontswap 2010-08-24 12:35:47.000000000 +0400 @@ -0,0 +1,16 @@ +What: /sys/kernel/mm/frontswap/ +Date: June 2010 +Contact: Dan Magenheimer +Description: + /sys/kernel/mm/frontswap/ contains a number of files which + record a count of various frontswap operations (sum across + all swap devices): + succ_puts + failed_puts + gets + flushes + In addition, reading the curr_pages file shows how many + pages are currently contained in frontswap and writing this + file with an integer performs a "partial swapoff", reducing + the number of frontswap pages to that integer if memory + constraints permit. diff -NruabBEp linux-2.6.34.5/Documentation/vm/cleancache.txt l-t/Documentation/vm/cleancache.txt --- linux-2.6.34.5/Documentation/vm/cleancache.txt 1970-01-01 03:00:00.000000000 +0300 +++ l-t/Documentation/vm/cleancache.txt 2010-08-24 12:35:53.000000000 +0400 @@ -0,0 +1,74 @@ +Cleancache can be thought of as a page-granularity victim cache for clean +pages that the kernel's pageframe replacement algorithm (PFRA) would like +to keep around, but can't since there isn't enough memory. So when the +PFRA "evicts" a page, it first attempts to put it into a synchronous +concurrency-safe page-oriented "pseudo-RAM" device (such as Xen's Transcendent +Memory, aka "tmem", or in-kernel compressed memory, aka "zmem", or other +RAM-like devices) which is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. And when a +cleancache-enabled filesystem wishes to access a page in a file on disk, +it first checks cleancache to see if it already contains it; if it does, +the page is copied into the kernel and a disk access is avoided. 
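As a minimal illustrative sketch of that read-side flow (not part of the patch itself): the helpers my_fs_readpage() and my_fs_block_read() below are hypothetical stand-ins for a filesystem's readpage path and its real block I/O; only cleancache_get_page(), which returns 0 on a hit per the frontend added by this patch, is the actual interface (compare the fs/mpage.c hunk later in this patch):

  #include <linux/fs.h>
  #include <linux/pagemap.h>
  #include <linux/cleancache.h>

  static int my_fs_readpage(struct file *file, struct page *page)
  {
  	/* ask the pseudo-RAM backend first; 0 means the page was filled */
  	if (cleancache_get_page(page) == 0) {
  		SetPageUptodate(page);
  		unlock_page(page);
  		return 0;		/* disk access avoided */
  	}
  	/* miss: fall back to an ordinary block-device read */
  	return my_fs_block_read(page);	/* hypothetical I/O helper */
  }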
+A cleancache "backend" that interfaces to this pseudo-RAM links itself +to the kernel's cleancache "frontend" by setting the cleancache_ops funcs +appropriately and the functions it provides must conform to certain +semantics as follows: + +Most important, cleancache is "ephemeral". Pages which are copied into +cleancache have an indefinite lifetime which is completely unknowable +by the kernel and so may or may not still be in cleancache at any later time. +Thus, as its name implies, cleancache is not suitable for dirty pages. +Cleancache has complete discretion over what pages to preserve and what +pages to discard and when. + +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a +pool id which, if positive, must be saved in the filesystem's superblock; +a negative return value indicates failure. A "put_page" will copy a +(presumably about-to-be-evicted) page into cleancache and associate it with +the pool id, the file inode, and a page index into the file. (The combination +of a pool id, an inode, and an index is sometimes called a "handle".) +A "get_page" will copy the page, if found, from cleancache into kernel memory. +A "flush_page" will ensure the page no longer is present in cleancache; +a "flush_inode" will flush all pages associated with the specified inode; +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in +all inodes specified by the given pool id and also surrender the pool id. + +A "init_shared_fs", like init, obtains a pool id but tells cleancache +to treat the pool as shared using a 128-bit UUID as a key. On systems +that may run multiple kernels (such as hard partitioned or virtualized +systems) that may share a clustered filesystem, and where cleancache +may be shared among those kernels, calls to init_shared_fs that specify the +same UUID will receive the same pool id, thus allowing the pages to +be shared. Note that any security requirements must be imposed outside +of the kernel (e.g. by "tools" that control cleancache). Or a +cleancache implementation can simply disable shared_init by always +returning a negative value. + +If a get_page is successful on a non-shared pool, the page is flushed (thus +making cleancache an "exclusive" cache). On a shared pool, the page +is NOT flushed on a successful get_page so that it remains accessible to +other sharers. The kernel is responsible for ensuring coherency between +cleancache (shared or not), the page cache, and the filesystem, using +cleancache flush operations as required. + +Note that cleancache must enforce put-put-get coherency and get-get +coherency. For the former, if two puts are made to the same handle but +with different data, say AAA by the first put and BBB by the second, a +subsequent get can never return the stale data (AAA). For get-get coherency, +if a get for a given handle fails, subsequent gets for that handle will +never succeed unless preceded by a successful put with that handle. + +Last, cleancache provides no SMP serialization guarantees; if two +different Linux threads are simultaneously putting and flushing a page +with the same handle, the results are indeterminate. + +Cleancache monitoring is done by sysfs files in the +/sys/kernel/mm/cleancache directory. 
The effectiveness of cleancache +can be measured (across all filesystems) with: + +succ_gets - number of gets that were successful +failed_gets - number of gets that failed +puts - number of puts attempted (all "succeed") +flushes - number of flushes attempted + +Dan Magenheimer, June 04 2010 diff -NruabBEp linux-2.6.34.5/Documentation/vm/frontswap.txt l-t/Documentation/vm/frontswap.txt --- linux-2.6.34.5/Documentation/vm/frontswap.txt 1970-01-01 03:00:00.000000000 +0300 +++ l-t/Documentation/vm/frontswap.txt 2010-08-24 12:35:56.000000000 +0400 @@ -0,0 +1,48 @@ +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented pseudo-RAM device (such as +Xen's Transcendent Memory, aka "tmem", or in-kernel compressed memory, +aka "zmem", or other RAM-like devices) which is not directly accessible +or addressable by the kernel and is of unknown and possibly time-varying +size. This pseudo-RAM device links itself to frontswap by setting the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the pseudo-RAM to receive frontswap pages and returns +a non-negative pool id, used for all swap device numbers (aka "type"). +A "put_page" will copy the page to pseudo-RAM and associate it with +the type and offset associated with the page. A "get_page" will copy the +page, if found, from pseudo-RAM into kernel memory, but will NOT remove +the page from pseudo-RAM. A "flush_page" will remove the page from +pseudo-RAM and a "flush_area" will remove ALL pages associated with the +swap type (e.g., like swapoff) and notify the pseudo-RAM device to refuse +further puts with that swap type. + +Once a page is successfully put, a matching get on the page will always +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the put returns +non-zero, the data has been successfully saved to pseudo-RAM and +a disk write and, if the data is later read back, a disk read are avoided. +If a put returns zero, pseudo-RAM has rejected the data, and the page can +be written to swap as usual. + +Note that if a page is put and the page already exists in pseudo-RAM +(a "duplicate" put), either the put succeeds and the data is overwritten, +or the put fails AND the page is flushed. This ensures stale data may +never be obtained from psuedo-RAM. + +Monitoring and control of frontswap is done by sysfs files in the +/sys/kernel/mm/frontswap directory. The effectiveness of frontswap can +be measured (across all swap devices) with: + +curr_pages - number of pages currently contained in frontswap +failed_puts - how many put attempts have failed +gets - how many gets were attempted (all should succeed) +succ_puts - how many put attempts have succeeded +flushes - how many flushes were attempted + +The number can be reduced by root by writing an integer target to curr_pages, +which results in a "partial swapoff", thus reducing the number of frontswap +pages to that target if memory constraints permit. 
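A minimal sketch of how a swap-out path might use this interface (again not part of the patch): my_swap_writepage() and my_swap_disk_write() are hypothetical stand-ins, and note that the in-kernel frontswap_put_page() helper added by mm/frontswap.c below returns 0 when the backend accepted the page:

  #include <linux/pagemap.h>
  #include <linux/writeback.h>
  #include <linux/frontswap.h>

  static int my_swap_writepage(struct page *page, struct writeback_control *wbc)
  {
  	/* try the pseudo-RAM backend first */
  	if (frontswap_put_page(page) == 0) {
  		/* accepted: the disk write (and a later disk read) is avoided */
  		set_page_writeback(page);
  		unlock_page(page);
  		end_page_writeback(page);
  		return 0;
  	}
  	/* rejected by the backend: write to the swap device as usual */
  	return my_swap_disk_write(page, wbc);	/* hypothetical I/O helper */
  }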
+ +Dan Magenheimer, June 04 2010 diff -NruabBEp linux-2.6.34.5/drivers/xen/balloon.c l-t/drivers/xen/balloon.c --- linux-2.6.34.5/drivers/xen/balloon.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/drivers/xen/balloon.c 2010-08-24 15:02:27.000000000 +0400 @@ -6,6 +6,7 @@ * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 @@ -43,7 +44,11 @@ #include #include #include +#include #include +#include +#include /* for cleancache_enabled */ +#include /* for frontswap_enabled */ #include #include @@ -69,6 +74,9 @@ struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; + unsigned long min_target_pages; + /* We may hit the hard limit in Xen. If we do then we remember it. */ + unsigned long hard_limit; /* * Drivers may alter the memory reservation independently, but they * must inform the balloon driver so we avoid hitting the hard limit. @@ -77,8 +85,35 @@ struct balloon_stats { /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + unsigned long boot_max_pfn; + u64 hotplug_start_paddr; + u64 hotplug_size; +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + int selfballooning_enabled; + unsigned int selfballoon_interval; + unsigned int selfballoon_downhysteresis; + unsigned int selfballoon_uphysteresis; +#ifdef CONFIG_FRONTSWAP + unsigned int frontswap_selfshrinking; + unsigned int frontswap_hysteresis; + unsigned int frontswap_inertia; +#endif +#endif }; +#ifdef CONFIG_XEN_SELFBALLOONING +/* for disabling from boot parameter even when cleancache is enabled */ +static int use_selfballooning __read_mostly = 1; +/* for enabling from sysfs even when cleancache is disabled */ +static int force_enable_selfballooning __read_mostly = 0; +#ifdef CONFIG_FRONTSWAP +/* for disabling from boot parameter even when frontswap is enabled */ +static int use_frontswap_selfshrink __read_mostly = 1; +#endif +#endif + static DEFINE_MUTEX(balloon_mutex); static struct sys_device balloon_sysdev; @@ -112,6 +147,11 @@ static LIST_HEAD(ballooned_pages); static void balloon_process(struct work_struct *work); static DECLARE_WORK(balloon_worker, balloon_process); static struct timer_list balloon_timer; +#ifdef CONFIG_XEN_SELFBALLOONING +static void selfballoon_process(struct work_struct *work); +static DECLARE_WORK(selfballoon_worker, selfballoon_process); +static struct timer_list selfballoon_timer; +#endif /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ @@ -125,6 +165,35 @@ static void scrub_page(struct page *page #endif } +/* Heuristics to ensure over-ballooning doesn't occur. 
Can be overridden */ +static unsigned long init_totalram_pages; + +static unsigned long min_target_pages(void) +{ +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* Simple continuous piecewiese linear function: + * max MiB -> min MiB gradient + * 0 0 + * 16 16 + * 32 24 + * 128 72 (1/2) + * 512 168 (1/4) + * 2048 360 (1/8) + * 8192 552 (1/32) + * 32768 1320 + * 131072 4392 + */ + if (init_totalram_pages < MB2PAGES(128)) + return MB2PAGES(8) + (init_totalram_pages >> 1); + else if (init_totalram_pages < MB2PAGES(512)) + return MB2PAGES(40) + (init_totalram_pages >> 2); + else if (init_totalram_pages < MB2PAGES(2048)) + return MB2PAGES(104) + (init_totalram_pages >> 3); + else + return MB2PAGES(296) + (init_totalram_pages >> 5); +#undef max_pfn +} + /* balloon_append: add the given page to the balloon. */ static void balloon_append(struct page *page) { @@ -184,18 +253,181 @@ static void balloon_alarm(unsigned long schedule_work(&balloon_worker); } -static unsigned long current_target(void) +#ifdef CONFIG_XEN_SELFBALLOONING +static void selfballoon_alarm(unsigned long unused) { - unsigned long target = balloon_stats.target_pages; + schedule_work(&selfballoon_worker); +} +#endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static inline u64 is_memory_resource_reserved(void) +{ + return balloon_stats.hotplug_start_paddr; +} + +static int allocate_additional_memory(unsigned long nr_pages) +{ + long rc; + resource_size_t r_min, r_size; + struct resource *r; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long flags, i, pfn; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + if (!is_memory_resource_reserved()) { + + /* + * Look for first unused memory region starting at page + * boundary. Skip last memory section created at boot time + * becuase it may contains unused memory pages with PG_reserved + * bit not set (online_pages require PG_reserved bit set). + */ + + r = kzalloc(sizeof(struct resource), GFP_KERNEL); + + if (!r) { + rc = -ENOMEM; + goto out_0; + } + + r->name = "System RAM"; + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); + r_size = (balloon_stats.target_pages - balloon_stats.current_pages) << PAGE_SHIFT; + + rc = allocate_resource(&iomem_resource, r, r_size, r_min, + ULONG_MAX, PAGE_SIZE, NULL, NULL); + + if (rc < 0) { + kfree(r); + goto out_0; + } + + balloon_stats.hotplug_start_paddr = r->start; + } + + spin_lock_irqsave(&balloon_lock, flags); + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr + balloon_stats.hotplug_size); + + for (i = 0; i < nr_pages; ++i, ++pfn) + frame_list[i] = pfn; - target = min(target, - balloon_stats.current_pages + - balloon_stats.balloon_low + - balloon_stats.balloon_high); + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; - return target; + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + + if (rc < 0) + goto out_1; + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr + balloon_stats.hotplug_size); + + for (i = 0; i < rc; ++i, ++pfn) { + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && + phys_to_machine_mapping_valid(pfn)); + set_phys_to_machine(pfn, frame_list[i]); + } + + balloon_stats.hotplug_size += rc << PAGE_SHIFT; + balloon_stats.current_pages += rc; + +out_1: + spin_unlock_irqrestore(&balloon_lock, flags); + +out_0: + return rc < 0 ? 
rc : rc != nr_pages; } +static void hotplug_allocated_memory(void) +{ + int nid, ret; + struct memory_block *mem; + unsigned long pfn, pfn_limit; + + nid = memory_add_physaddr_to_nid(balloon_stats.hotplug_start_paddr); + + ret = add_registered_memory(nid, balloon_stats.hotplug_start_paddr, + balloon_stats.hotplug_size); + + if (ret) { + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", + __func__, ret); + goto error; + } + + if (xen_pv_domain()) { + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr); + pfn_limit = pfn + (balloon_stats.hotplug_size >> PAGE_SHIFT); + + for (; pfn < pfn_limit; ++pfn) + if (!PageHighMem(pfn_to_page(pfn))) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0)); + } + + ret = online_pages(PFN_DOWN(balloon_stats.hotplug_start_paddr), + balloon_stats.hotplug_size >> PAGE_SHIFT); + + if (ret) { + pr_err("%s: online_pages: Failed: %i\n", __func__, ret); + goto error; + } + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr); + pfn_limit = pfn + (balloon_stats.hotplug_size >> PAGE_SHIFT); + + for (; pfn < pfn_limit; pfn += PAGES_PER_SECTION) { + mem = find_memory_block(__pfn_to_section(pfn)); + BUG_ON(!mem); + BUG_ON(!present_section_nr(mem->phys_index)); + mutex_lock(&mem->state_mutex); + mem->state = MEM_ONLINE; + mutex_unlock(&mem->state_mutex); + } + + goto out; + +error: + balloon_stats.current_pages -= balloon_stats.hotplug_size >> PAGE_SHIFT; + balloon_stats.target_pages -= balloon_stats.hotplug_size >> PAGE_SHIFT; + +out: + balloon_stats.hotplug_start_paddr = 0; + balloon_stats.hotplug_size = 0; +} +#else +static inline u64 is_memory_resource_reserved(void) +{ + return 0; +} + +static inline int allocate_additional_memory(unsigned long nr_pages) +{ + /* + * CONFIG_XEN_BALLOON_MEMORY_HOTPLUG is not set. + * balloon_stats.target_pages could not be bigger + * than balloon_stats.current_pages because additional + * memory allocation is not possible. + */ + balloon_stats.target_pages = balloon_stats.current_pages; + + return 0; +} + +static inline void hotplug_allocated_memory(void) +{ +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ + static int increase_reservation(unsigned long nr_pages) { unsigned long pfn, i, flags; @@ -222,10 +454,23 @@ static int increase_reservation(unsigned set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < 0) + if (rc < nr_pages) { + if (rc > 0) { + int ret; + + /* We hit the Xen hard limit: reprobe. */ + reservation.nr_extents = rc; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + BUG_ON(ret != rc); + } + if (rc >= 0) + balloon_stats.hard_limit = (balloon_stats.current_pages + rc - + balloon_stats.driver_pages); goto out; + } - for (i = 0; i < rc; i++) { + for (i = 0; i < nr_pages; i++) { page = balloon_retrieve(); BUG_ON(page == NULL); @@ -236,7 +481,7 @@ static int increase_reservation(unsigned set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. 
*/ - if (pfn < max_low_pfn) { + if (xen_pv_domain() && !PageHighMem(page)) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -251,12 +496,13 @@ static int increase_reservation(unsigned __free_page(page); } - balloon_stats.current_pages += rc; + balloon_stats.current_pages += nr_pages; + totalram_pages = balloon_stats.current_pages; out: spin_unlock_irqrestore(&balloon_lock, flags); - return rc < 0 ? rc : rc != nr_pages; + return 0; } static int decrease_reservation(unsigned long nr_pages) @@ -314,6 +560,7 @@ static int decrease_reservation(unsigned BUG_ON(ret != nr_pages); balloon_stats.current_pages -= nr_pages; + totalram_pages = balloon_stats.current_pages; spin_unlock_irqrestore(&balloon_lock, flags); @@ -334,9 +581,15 @@ static void balloon_process(struct work_ mutex_lock(&balloon_mutex); do { - credit = current_target() - balloon_stats.current_pages; - if (credit > 0) + credit = balloon_stats.target_pages - balloon_stats.current_pages; + + if (credit > 0) { + if (balloon_stats.balloon_low || balloon_stats.balloon_high) need_sleep = (increase_reservation(credit) != 0); + else + need_sleep = (allocate_additional_memory(credit) != 0); + } + if (credit < 0) need_sleep = (decrease_reservation(-credit) != 0); @@ -347,8 +600,10 @@ static void balloon_process(struct work_ } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ - if (current_target() != balloon_stats.current_pages) + if (balloon_stats.target_pages != balloon_stats.current_pages) mod_timer(&balloon_timer, jiffies + HZ); + else if (is_memory_resource_reserved()) + hotplug_allocated_memory(); mutex_unlock(&balloon_mutex); } @@ -357,7 +612,8 @@ static void balloon_process(struct work_ static void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. 
*/ - balloon_stats.target_pages = target; + balloon_stats.hard_limit = ~0UL; + balloon_stats.target_pages = max(target, balloon_stats.min_target_pages); schedule_work(&balloon_worker); } @@ -405,24 +661,52 @@ static int __init balloon_init(void) unsigned long pfn; struct page *page; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; pr_info("xen_balloon: Initialising balloon driver.\n"); + if (xen_pv_domain()) balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); + else + balloon_stats.current_pages = max_pfn; + init_totalram_pages = totalram_pages = balloon_stats.current_pages; + balloon_stats.min_target_pages = min_target_pages(); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; balloon_stats.driver_pages = 0UL; + balloon_stats.hard_limit = ~0UL; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.boot_max_pfn = max_pfn; + balloon_stats.hotplug_start_paddr = 0; + balloon_stats.hotplug_size = 0; +#endif init_timer(&balloon_timer); balloon_timer.data = 0; balloon_timer.function = balloon_alarm; +#ifdef CONFIG_XEN_SELFBALLOONING + balloon_stats.selfballooning_enabled = use_selfballooning; + balloon_stats.selfballoon_interval = 1; + balloon_stats.selfballoon_downhysteresis = 8; + balloon_stats.selfballoon_uphysteresis = 1; +#ifdef CONFIG_FRONTSWAP + balloon_stats.frontswap_selfshrinking = use_frontswap_selfshrink; + balloon_stats.frontswap_hysteresis = 20; + balloon_stats.frontswap_inertia = 1; +#endif + init_timer(&selfballoon_timer); + selfballoon_timer.data = 0; + selfballoon_timer.function = selfballoon_alarm; + mod_timer(&selfballoon_timer, jiffies + balloon_stats.selfballoon_interval * HZ); +#endif + register_balloon(&balloon_sysdev); /* Initialise the balloon with excess memory space. */ + if (xen_pv_domain()) for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { page = pfn_to_page(pfn); if (!PageReserved(page)) @@ -460,6 +745,9 @@ BALLOON_SHOW(current_kb, "%lu\n", PAGES2 BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); +BALLOON_SHOW(hard_limit_kb, + (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n", + (balloon_stats.hard_limit!=~0UL) ? 
PAGES2KB(balloon_stats.hard_limit) : 0); static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) @@ -488,6 +776,30 @@ static ssize_t store_target_kb(struct sy static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, show_target_kb, store_target_kb); +static ssize_t show_min_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.min_target_pages)); +} + +static ssize_t store_min_target_kb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + balloon_stats.min_target_pages = + (simple_strtoull(buf, &endchar, 0) * 1024) >> PAGE_SHIFT; + + return count; +} + +static SYSDEV_ATTR(min_target_kb, S_IRUGO | S_IWUSR, + show_min_target_kb, store_min_target_kb); static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) @@ -518,12 +830,206 @@ static ssize_t store_target(struct sys_d static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, show_target, store_target); +#ifdef CONFIG_XEN_SELFBALLOONING +static ssize_t show_selfballooning(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.selfballooning_enabled); +} + +static ssize_t store_selfballooning(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + int was_enabled = balloon_stats.selfballooning_enabled; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + balloon_stats.selfballooning_enabled = !!memparse(buf, &endchar); + + if (!was_enabled && balloon_stats.selfballooning_enabled) { + mod_timer(&selfballoon_timer, + jiffies + balloon_stats.selfballoon_interval * HZ); + force_enable_selfballooning = 1; + } else + force_enable_selfballooning = 0; + + return count; +} + +static SYSDEV_ATTR(selfballooning, S_IRUGO | S_IWUSR, + show_selfballooning, store_selfballooning); + +static ssize_t show_selfballoon_interval(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.selfballoon_interval); +} + +static ssize_t store_selfballoon_interval(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + balloon_stats.selfballoon_interval = memparse(buf, &endchar); + return count; +} + +static SYSDEV_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, + show_selfballoon_interval, store_selfballoon_interval); + +static ssize_t show_selfballoon_downhys(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.selfballoon_downhysteresis); +} + +static ssize_t store_selfballoon_downhys(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + balloon_stats.selfballoon_downhysteresis = memparse(buf, &endchar); + return count; +} + +static SYSDEV_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_downhys, store_selfballoon_downhys); + + +static ssize_t show_selfballoon_uphys(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.selfballoon_uphysteresis); +} + +static ssize_t store_selfballoon_uphys(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char 
*endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + balloon_stats.selfballoon_uphysteresis = memparse(buf, &endchar); + return count; +} + +static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_uphys, store_selfballoon_uphys); + +#ifdef CONFIG_FRONTSWAP +static ssize_t show_frontswap_selfshrinking(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.frontswap_selfshrinking); +} + + +static ssize_t store_frontswap_selfshrinking(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + int was_enabled = balloon_stats.frontswap_selfshrinking; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + balloon_stats.frontswap_selfshrinking = !!memparse(buf, &endchar); + + if (!was_enabled && !balloon_stats.selfballooning_enabled && + balloon_stats.frontswap_selfshrinking) + mod_timer(&selfballoon_timer, + jiffies + balloon_stats.selfballoon_interval * HZ); + + return count; +} + +static SYSDEV_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, + show_frontswap_selfshrinking, store_frontswap_selfshrinking); + +static unsigned long frontswap_inertia_counter = 0; + + +static ssize_t show_frontswap_inertia(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.frontswap_inertia); +} +static ssize_t store_frontswap_inertia(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + balloon_stats.frontswap_inertia = memparse(buf, &endchar); + frontswap_inertia_counter = balloon_stats.frontswap_inertia; + return count; +} + +static SYSDEV_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, + show_frontswap_inertia, store_frontswap_inertia); + +static ssize_t show_frontswap_hysteresis(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", balloon_stats.frontswap_hysteresis); +} + +static ssize_t store_frontswap_hysteresis(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + balloon_stats.frontswap_hysteresis = memparse(buf, &endchar); + return count; +} + +static SYSDEV_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, + show_frontswap_hysteresis, store_frontswap_hysteresis); + +#endif /* CONFIG_FRONTSWAP */ +#endif /* CONFIG_XEN_SELFBALLOONING */ static struct sysdev_attribute *balloon_attrs[] = { &attr_target_kb, + &attr_min_target_kb, &attr_target, +#ifdef CONFIG_XEN_SELFBALLOONING + &attr_selfballooning, + &attr_selfballoon_interval, + &attr_selfballoon_downhysteresis, + &attr_selfballoon_uphysteresis, +#ifdef CONFIG_FRONTSWAP + &attr_frontswap_selfshrinking, + &attr_frontswap_hysteresis, + &attr_frontswap_inertia, +#endif +#endif }; + static struct attribute *balloon_info_attrs[] = { &attr_current_kb.attr, &attr_low_kb.attr, @@ -529,6 +1035,7 @@ static struct attribute *balloon_info_at &attr_low_kb.attr, &attr_high_kb.attr, &attr_driver_kb.attr, + &attr_hard_limit_kb.attr, NULL }; @@ -578,4 +1085,76 @@ static int register_balloon(struct sys_d return error; } +#ifdef CONFIG_XEN_SELFBALLOONING +#ifdef CONFIG_FRONTSWAP +static void frontswap_selfshrink(void) +{ + static unsigned long cur_frontswap_pages = 0; + static unsigned long last_frontswap_pages = 0; + static unsigned long tgt_frontswap_pages = 0; + + if (!balloon_stats.frontswap_hysteresis) + 
return; + last_frontswap_pages = cur_frontswap_pages; + cur_frontswap_pages = frontswap_curr_pages(); + if (!cur_frontswap_pages || (cur_frontswap_pages > last_frontswap_pages)) { + frontswap_inertia_counter = balloon_stats.frontswap_inertia; + return; + } + if (frontswap_inertia_counter && --frontswap_inertia_counter) + return; + if ( cur_frontswap_pages <= balloon_stats.frontswap_hysteresis) + tgt_frontswap_pages = 0; + else tgt_frontswap_pages = cur_frontswap_pages - + (cur_frontswap_pages / balloon_stats.frontswap_hysteresis); + frontswap_shrink(tgt_frontswap_pages); +} + +static int __init no_frontswap_selfshrink_setup(char *s) +{ + use_frontswap_selfshrink = 0; + return 1; +} + +__setup("noselfshrink", no_frontswap_selfshrink_setup); +#endif + +static void selfballoon_process(struct work_struct *work) +{ + extern unsigned long vm_get_committed_as(void); + unsigned long cur_pages, goal_pages, tgt_pages; + int reset_timer = 0; + + if (balloon_stats.selfballooning_enabled && + (force_enable_selfballooning || cleancache_enabled)) { + tgt_pages = cur_pages = totalram_pages; + goal_pages = vm_get_committed_as(); + if (cur_pages > goal_pages) + tgt_pages = cur_pages - + (cur_pages - goal_pages) / balloon_stats.selfballoon_downhysteresis; + else if (cur_pages < goal_pages) + tgt_pages = cur_pages + + (goal_pages - cur_pages) / balloon_stats.selfballoon_uphysteresis; + balloon_set_new_target(tgt_pages); + reset_timer = 1; + } +#ifdef CONFIG_FRONTSWAP + if (balloon_stats.frontswap_selfshrinking && frontswap_enabled) { + frontswap_selfshrink(); + reset_timer = 1; + } +#endif + if (reset_timer) + mod_timer(&selfballoon_timer, jiffies + balloon_stats.selfballoon_interval * HZ); +} + +static int __init noselfballooning_setup(char *s) +{ + use_selfballooning = 0; + return 1; +} + +__setup("noselfballooning", noselfballooning_setup); +#endif + MODULE_LICENSE("GPL"); diff -NruabBEp linux-2.6.34.5/drivers/xen/Kconfig l-t/drivers/xen/Kconfig --- linux-2.6.34.5/drivers/xen/Kconfig 2010-08-20 22:52:05.000000000 +0400 +++ l-t/drivers/xen/Kconfig 2010-08-24 12:38:12.000000000 +0400 @@ -9,6 +9,26 @@ config XEN_BALLOON the system to expand the domain's memory allocation, or alternatively return unneeded memory to the system. +config XEN_BALLOON_MEMORY_HOTPLUG + bool "Xen memory balloon driver with memory hotplug support" + default y + depends on XEN_BALLOON && MEMORY_HOTPLUG + help + Xen memory balloon driver with memory hotplug support allows expanding + memory available for the system above limit declared at system startup. + It is very useful on critical systems which require long run without + rebooting. + +config XEN_SELFBALLOONING + bool "dynamically self-balloon kernel memory to target" + depends on XEN && XEN_BALLOON_MEMORY_HOTPLUG + default y + help + Self-ballooning dynamically balloons available kernel memory driven + by the current usage of anonymous memory ("committed AS") and + controlled by various sysfs-settable parameters. 
May be overridden + by the noselfballooning kernel boot parameter + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON diff -NruabBEp linux-2.6.34.5/drivers/xen/Makefile l-t/drivers/xen/Makefile --- linux-2.6.34.5/drivers/xen/Makefile 2010-08-20 22:52:05.000000000 +0400 +++ l-t/drivers/xen/Makefile 2010-08-24 12:38:34.000000000 +0400 @@ -1,5 +1,6 @@ obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ +obj-y += tmem.o nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) diff -NruabBEp linux-2.6.34.5/drivers/xen/tmem.c l-t/drivers/xen/tmem.c --- linux-2.6.34.5/drivers/xen/tmem.c 1970-01-01 03:00:00.000000000 +0300 +++ l-t/drivers/xen/tmem.c 2010-08-24 12:36:51.000000000 +0400 @@ -0,0 +1,320 @@ +/* + * Xen implementation for transcendent memory (tmem) + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* xen tmem foundation ops/hypercalls */ + +static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, + u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) +{ + struct tmem_op op; + int rc = 0; + + op.cmd = tmem_cmd; + op.pool_id = tmem_pool; + op.u.gen.object = object; + op.u.gen.index = index; + op.u.gen.tmem_offset = tmem_offset; + op.u.gen.pfn_offset = pfn_offset; + op.u.gen.len = len; + set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, + u32 flags, unsigned long pagesize) +{ + struct tmem_op op; + int rc = 0, pageshift; + + for (pageshift = 0; pagesize != 1; pageshift++) + pagesize >>= 1; + flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; + op.cmd = TMEM_NEW_POOL; + op.u.new.uuid[0] = uuid.uuid_lo; + op.u.new.uuid[1] = uuid.uuid_hi; + op.u.new.flags = flags; + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +/* xen generic tmem ops */ + +static int xen_tmem_put_page(u32 pool_id, u64 object, u32 index, + unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_PUT_PAGE, pool_id, object, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_get_page(u32 pool_id, u64 object, u32 index, + unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_GET_PAGE, pool_id, object, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_flush_page(u32 pool_id, u64 object, u32 index) +{ + return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, object, index, + 0, 0, 0, 0); +} + +static int xen_tmem_flush_object(u32 pool_id, u64 object) +{ + return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, object, 0, 0, 0, 0, 0); +} + +static int xen_tmem_destroy_pool(u32 pool_id) +{ + return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, 0, 0, 0, 0, 0, 0); +} + +int tmem_enabled = 0; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = 1; + return 1; +} + +__setup("tmem", enable_tmem); + +/* cleancache ops */ + +static void tmem_cleancache_put_page(int pool, ino_t inode, pgoff_t index, + struct page *page) +{ + u32 ind = (u32) index; + unsigned long pfn = page_to_pfn(page); + + if (pool < 0) + return; + if (ind != index) + return; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + (void)xen_tmem_put_page((u32)pool, (u64)inode, ind, pfn); +} + +static int tmem_cleancache_get_page(int pool, ino_t inode, pgoff_t index, + struct page *page) +{ + u32 ind = (u32) index; + unsigned long pfn = page_to_pfn(page); + int ret; + + /* translate return values to linux semantics */ + if (pool < 0) + return -1; + if (ind != index) + return -1; + ret = xen_tmem_get_page((u32)pool, (u64)inode, ind, pfn); + if (ret == 1) + return 0; + else + return -1; +} + +static void tmem_cleancache_flush_page(int pool, ino_t inode, pgoff_t index) +{ + u32 ind = (u32) index; + + if (pool < 0) + return; + if (ind != index) + return; + (void)xen_tmem_flush_page((u32)pool, (u64)inode, ind); +} + +static void tmem_cleancache_flush_inode(int pool, ino_t inode) +{ + if (pool < 0) + return; + (void)xen_tmem_flush_object((u32)pool, (u64)inode); +} + +static void tmem_cleancache_flush_fs(int pool) +{ + if (pool < 0) + return; + (void)xen_tmem_destroy_pool((u32)pool); +} + +static int tmem_cleancache_init_fs(size_t pagesize) +{ + struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; + + return xen_tmem_new_pool(uuid_private, 0, pagesize); +} + +static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + struct tmem_pool_uuid shared_uuid; + + shared_uuid.uuid_lo = *(u64 *)uuid; + shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); + return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); +} + +static int use_cleancache = 1; + +static int __init no_cleancache(char *s) +{ + use_cleancache = 0; + return 1; +} + +__setup("nocleancache", no_cleancache); + +/* frontswap tmem operations */ + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +/* returns 0 if the page was successfully put into frontswap, -1 if not */ +static int tmem_frontswap_put_page(int pool, unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int ret; + + if (ind64 != ind) + return -1; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) */ +static int tmem_frontswap_get_page(int pool, unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int ret; + + if (ind64 != ind) + return -1; + ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* flush a single page from frontswap */ +static void tmem_frontswap_flush_page(int pool, unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + + if (ind64 != ind) + return; + (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void tmem_frontswap_flush_area(int pool, unsigned type) +{ + int ind; + + for (ind = SWIZ_MASK; ind >= 0; ind--) + (void)xen_tmem_flush_object(pool, oswiz(type, ind)); +} + +static int tmem_frontswap_init(unsigned ignored) +{ + struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; + static int tmem_frontswap_poolid = -1; + int ret; + + if (tmem_frontswap_poolid < 0) + ret = (int)xen_tmem_new_pool(private, TMEM_POOL_PERSIST, + PAGE_SIZE); + else + ret = tmem_frontswap_poolid; + return ret; +} + +static int use_frontswap = 1; + +static int __init no_frontswap(char *s) +{ + use_frontswap = 0; + return 1; +} + +__setup("nofrontswap", no_frontswap); + +static int __init xen_tmem_init(void) +{ + if (!xen_domain()) + return 0; +#ifdef CONFIG_FRONTSWAP + if (tmem_enabled && use_frontswap) { + frontswap_ops.put_page = tmem_frontswap_put_page; + frontswap_ops.get_page = tmem_frontswap_get_page; + frontswap_ops.flush_page = tmem_frontswap_flush_page; + frontswap_ops.flush_area = tmem_frontswap_flush_area; + frontswap_ops.init = tmem_frontswap_init; + printk(KERN_INFO "frontswap enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif +#ifdef CONFIG_CLEANCACHE + if (tmem_enabled && use_cleancache) { + cleancache_ops.put_page = tmem_cleancache_put_page; + cleancache_ops.get_page = tmem_cleancache_get_page; + cleancache_ops.flush_page = tmem_cleancache_flush_page; + cleancache_ops.flush_inode = tmem_cleancache_flush_inode; + cleancache_ops.flush_fs = tmem_cleancache_flush_fs; + cleancache_ops.init_shared_fs = tmem_cleancache_init_shared_fs; + cleancache_ops.init_fs = tmem_cleancache_init_fs; + printk(KERN_INFO "cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif + return 0; +} + +module_init(xen_tmem_init) diff -NruabBEp linux-2.6.34.5/fs/buffer.c l-t/fs/buffer.c --- linux-2.6.34.5/fs/buffer.c 2010-08-20 
22:52:05.000000000 +0400 +++ l-t/fs/buffer.c 2010-08-24 12:36:23.000000000 +0400 @@ -41,6 +41,7 @@ #include #include #include +#include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -276,6 +277,11 @@ void invalidate_bdev(struct block_device invalidate_bh_lrus(); invalidate_mapping_pages(mapping, 0, -1); + + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff -NruabBEp linux-2.6.34.5/fs/ext3/super.c l-t/fs/ext3/super.c --- linux-2.6.34.5/fs/ext3/super.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/fs/ext3/super.c 2010-08-24 12:36:27.000000000 +0400 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -1344,6 +1345,7 @@ static int ext3_setup_super(struct super } else { ext3_msg(sb, KERN_INFO, "using internal journal"); } + sb->cleancache_poolid = cleancache_init_fs(PAGE_SIZE); return res; } diff -NruabBEp linux-2.6.34.5/fs/mpage.c l-t/fs/mpage.c --- linux-2.6.34.5/fs/mpage.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/fs/mpage.c 2010-08-24 12:36:34.000000000 +0400 @@ -27,6 +27,7 @@ #include #include #include +#include /* * I/O completion handler for multipage BIOs. @@ -286,6 +287,12 @@ do_mpage_readpage(struct bio *bio, struc SetPageMappedToDisk(page); } + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && + cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + /* * This page will go to BIO. Do we need to send this BIO off first? */ diff -NruabBEp linux-2.6.34.5/fs/super.c l-t/fs/super.c --- linux-2.6.34.5/fs/super.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/fs/super.c 2010-08-24 12:36:37.000000000 +0400 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -105,6 +106,7 @@ static struct super_block *alloc_super(s s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; } out: return s; @@ -195,6 +197,11 @@ void deactivate_super(struct super_block vfs_dq_off(s, 0); down_write(&s->s_umount); fs->kill_sb(s); + if (s->cleancache_poolid >= 0) { + int cleancache_poolid = s->cleancache_poolid; + s->cleancache_poolid = -1; /* avoid races */ + cleancache_flush_fs(cleancache_poolid); + } put_filesystem(fs); put_super(s); } @@ -221,6 +228,7 @@ void deactivate_locked_super(struct supe spin_unlock(&sb_lock); vfs_dq_off(s, 0); fs->kill_sb(s); + cleancache_flush_fs(s->cleancache_poolid); put_filesystem(fs); put_super(s); } else { diff -NruabBEp linux-2.6.34.5/include/linux/cleancache.h l-t/include/linux/cleancache.h --- linux-2.6.34.5/include/linux/cleancache.h 1970-01-01 03:00:00.000000000 +0300 +++ l-t/include/linux/cleancache.h 2010-08-24 12:33:21.000000000 +0400 @@ -0,0 +1,88 @@ +#ifndef _LINUX_CLEANCACHE_H +#define _LINUX_CLEANCACHE_H + +#include +#include + +struct cleancache_ops { + int (*init_fs)(size_t); + int (*init_shared_fs)(char *uuid, size_t); + int (*get_page)(int, ino_t, pgoff_t, struct page *); + void (*put_page)(int, ino_t, pgoff_t, struct page *); + void (*flush_page)(int, ino_t, pgoff_t); + void (*flush_inode)(int, ino_t); + void (*flush_fs)(int); +}; + +extern struct cleancache_ops cleancache_ops; +extern int __cleancache_get_page(struct page *); +extern void __cleancache_put_page(struct page *); +extern void __cleancache_flush_page(struct address_space *, struct page *); +extern void __cleancache_flush_inode(struct address_space *); + +#ifdef 
CONFIG_CLEANCACHE +#define cleancache_enabled (cleancache_ops.init_fs) +#else +#define cleancache_enabled (0) +#endif + +/* called by a cleancache-enabled filesystem at time of mount */ +static inline int cleancache_init_fs(size_t pagesize) +{ + int ret = -1; + + if (cleancache_enabled) + ret = (*cleancache_ops.init_fs)(pagesize); + return ret; +} + +/* called by a cleancache-enabled clustered filesystem at time of mount */ +static inline int cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + int ret = -1; + + if (cleancache_enabled) + ret = (*cleancache_ops.init_shared_fs)(uuid, pagesize); + return ret; +} + +static inline int cleancache_get_page(struct page *page) +{ + int ret = -1; + + if (cleancache_enabled) + ret = __cleancache_get_page(page); + return ret; +} + +static inline void cleancache_put_page(struct page *page) +{ + if (cleancache_enabled) + __cleancache_put_page(page); +} + +static inline void cleancache_flush_page(struct address_space *mapping, + struct page *page) +{ + if (cleancache_enabled) + __cleancache_flush_page(mapping, page); +} + +static inline void cleancache_flush_inode(struct address_space *mapping) +{ + if (cleancache_enabled) + __cleancache_flush_inode(mapping); +} + +/* + * called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs + */ +static inline void cleancache_flush_fs(int pool_id) +{ + if (cleancache_enabled && pool_id >= 0) + (*cleancache_ops.flush_fs)(pool_id); +} + +#endif /* _LINUX_CLEANCACHE_H */ diff -NruabBEp linux-2.6.34.5/include/linux/frontswap.h l-t/include/linux/frontswap.h --- linux-2.6.34.5/include/linux/frontswap.h 1970-01-01 03:00:00.000000000 +0300 +++ l-t/include/linux/frontswap.h 2010-08-24 12:33:25.000000000 +0400 @@ -0,0 +1,97 @@ +#ifndef _LINUX_FRONTSWAP_H +#define _LINUX_FRONTSWAP_H + +#include +#include + +struct frontswap_ops { + int (*init)(unsigned); /* frontswap enabled if non-NULL */ + int (*put_page)(int, unsigned, pgoff_t, struct page *); + int (*get_page)(int, unsigned, pgoff_t, struct page *); + void (*flush_page)(int, unsigned, pgoff_t); + void (*flush_area)(int, unsigned); +}; + +extern int frontswap_poolid; + +extern struct frontswap_ops frontswap_ops; +extern void frontswap_shrink(unsigned long); +extern unsigned long frontswap_curr_pages(void); + +extern int __frontswap_put_page(struct page *page); +extern int __frontswap_get_page(struct page *page); +extern void __frontswap_flush_page(unsigned, pgoff_t); +extern void __frontswap_flush_area(unsigned); + +#ifdef CONFIG_FRONTSWAP +#define frontswap_enabled (frontswap_ops.init) +#else +/* all inline routines become no-ops and all externs are ignored */ +#define frontswap_enabled ((struct frontswap_ops *)NULL) +#endif + +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + int ret = 0; + + if (frontswap_enabled && sis->frontswap_map) + ret = test_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); + return ret; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + set_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + clear_bit(offset % BITS_PER_LONG, + &sis->frontswap_map[offset/BITS_PER_LONG]); +} + + +static inline void 
frontswap_init(unsigned type) +{ + if (frontswap_enabled) { + /* only need one poolid regardless of number of swap types */ + if (frontswap_poolid < 0) + frontswap_poolid = (*frontswap_ops.init)(type); + } +} + +static inline int frontswap_put_page(struct page *page) +{ + int ret = 0; + + if (frontswap_enabled && frontswap_poolid >= 0) + ret = __frontswap_put_page(page); + return ret; +} + +static inline int frontswap_get_page(struct page *page) +{ + int ret = 0; + + if (frontswap_enabled && frontswap_poolid >= 0) + ret = __frontswap_get_page(page); + return ret; +} + +static inline void frontswap_flush_page(unsigned type, pgoff_t offset) +{ + if (frontswap_enabled && frontswap_poolid >= 0) + __frontswap_flush_page(type, offset); +} + +static inline void frontswap_flush_area(unsigned type) +{ + if (frontswap_enabled && frontswap_poolid >= 0) + __frontswap_flush_area(type); +} + +#endif /* _LINUX_FRONTSWAP_H */ diff -NruabBEp linux-2.6.34.5/include/linux/fs.h l-t/include/linux/fs.h --- linux-2.6.34.5/include/linux/fs.h 2010-08-20 22:52:05.000000000 +0400 +++ l-t/include/linux/fs.h 2010-08-24 12:33:29.000000000 +0400 @@ -1383,6 +1383,13 @@ struct super_block { * generic_show_options() */ char *s_options; + +#ifndef __GENKSYMS__ + /* + * Saved pool identifier for cleancache (-1 means none) + */ + int cleancache_poolid; +#endif }; extern struct timespec current_fs_time(struct super_block *sb); diff -NruabBEp linux-2.6.34.5/include/linux/memory_hotplug.h l-t/include/linux/memory_hotplug.h --- linux-2.6.34.5/include/linux/memory_hotplug.h 2010-08-20 22:52:05.000000000 +0400 +++ l-t/include/linux/memory_hotplug.h 2010-08-24 12:12:05.000000000 +0400 @@ -202,6 +202,7 @@ static inline int is_mem_section_removab } #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern int add_registered_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int remove_memory(u64 start, u64 size); diff -NruabBEp linux-2.6.34.5/include/linux/swapfile.h l-t/include/linux/swapfile.h --- linux-2.6.34.5/include/linux/swapfile.h 1970-01-01 03:00:00.000000000 +0300 +++ l-t/include/linux/swapfile.h 2010-08-24 12:33:33.000000000 +0400 @@ -0,0 +1,13 @@ +#ifndef _LINUX_SWAPFILE_H +#define _LINUX_SWAPFILE_H + +/* + * these were static in swapfile.c but frontswap.c needs them and we don't + * want to expose them to the dozens of source files that include swap.h + */ +extern spinlock_t swap_lock; +extern struct swap_list_t swap_list; +extern struct swap_info_struct *swap_info[]; +extern int try_to_unuse(unsigned int, bool, unsigned long); + +#endif /* _LINUX_SWAPFILE_H */ diff -NruabBEp linux-2.6.34.5/include/linux/swap.h l-t/include/linux/swap.h --- linux-2.6.34.5/include/linux/swap.h 2010-08-20 22:52:05.000000000 +0400 +++ l-t/include/linux/swap.h 2010-08-24 12:33:37.000000000 +0400 @@ -182,6 +182,10 @@ struct swap_info_struct { struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ +#ifndef __GENKSYMS__ + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ + unsigned int frontswap_pages; /* frontswap pages in-use counter */ +#endif }; struct swap_list_t { diff -NruabBEp linux-2.6.34.5/include/xen/interface/tmem.h l-t/include/xen/interface/tmem.h --- linux-2.6.34.5/include/xen/interface/tmem.h 1970-01-01 03:00:00.000000000 +0300 +++ l-t/include/xen/interface/tmem.h 2010-08-24 13:15:49.000000000 +0400 @@ -0,0 
+1,43 @@ +/* + * include/xen/interface/tmem.h + * + * Interface to Xen implementation of transcendent memory + * + * Copyright (C) 2009 Dan Magenheimer, Oracle Corp. + */ + +#include + +#define TMEM_CONTROL 0 +#define TMEM_NEW_POOL 1 +#define TMEM_DESTROY_POOL 2 +#define TMEM_NEW_PAGE 3 +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_READ 8 +#define TMEM_WRITE 9 +#define TMEM_XCHG 10 + +/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */ +#define TMEMC_THAW 0 +#define TMEMC_FREEZE 1 +#define TMEMC_FLUSH 2 +#define TMEMC_DESTROY 3 +#define TMEMC_LIST 4 +#define TMEMC_SET_WEIGHT 5 +#define TMEMC_SET_CAP 6 +#define TMEMC_SET_COMPRESS 7 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_POOL_PAGESIZE_MASK 0xf +#define TMEM_POOL_VERSION_SHIFT 24 +#define TMEM_POOL_VERSION_MASK 0xff + +/* Special errno values */ +#define EFROZEN 1000 +#define EEMPTY 1001 diff -NruabBEp linux-2.6.34.5/include/xen/interface/xen.h l-t/include/xen/interface/xen.h --- linux-2.6.34.5/include/xen/interface/xen.h 2010-08-20 22:52:05.000000000 +0400 +++ l-t/include/xen/interface/xen.h 2010-08-24 12:34:45.000000000 +0400 @@ -58,6 +58,7 @@ #define __HYPERVISOR_event_channel_op 32 #define __HYPERVISOR_physdev_op 33 #define __HYPERVISOR_hvm_op 34 +#define __HYPERVISOR_tmem_op 38 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 @@ -461,6 +462,27 @@ typedef uint8_t xen_domain_handle_t[16]; #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) +struct tmem_op { + uint32_t cmd; + int32_t pool_id; + union { + struct { /* for cmd == TMEM_NEW_POOL */ + uint64_t uuid[2]; + uint32_t flags; + } new; + struct { + uint64_t object; + uint32_t index; + uint32_t tmem_offset; + uint32_t pfn_offset; + uint32_t len; + GUEST_HANDLE(void) gmfn; /* guest machine page frame */ + } gen; + } u; +}; +typedef struct tmem_op tmem_op_t; +DEFINE_GUEST_HANDLE_STRUCT(tmem_op_t); + #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ diff -NruabBEp linux-2.6.34.5/include/xen/tmem.h l-t/include/xen/tmem.h --- linux-2.6.34.5/include/xen/tmem.h 1970-01-01 03:00:00.000000000 +0300 +++ l-t/include/xen/tmem.h 2010-08-24 13:15:44.000000000 +0400 @@ -0,0 +1,22 @@ +/* + * include/xen/tmem.h + * + * Interface to transcendent memory + * + * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp. + */ + +#include + +struct tmem_pool_uuid { + u64 uuid_lo; + u64 uuid_hi; +}; + +#define TMEM_POOL_PRIVATE_UUID { 0, 0 } + +/* flags for tmem_ops.new_pool */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 + +extern int tmem_enabled; diff -NruabBEp linux-2.6.34.5/mm/cleancache.c l-t/mm/cleancache.c --- linux-2.6.34.5/mm/cleancache.c 1970-01-01 03:00:00.000000000 +0300 +++ l-t/mm/cleancache.c 2010-08-24 12:32:15.000000000 +0400 @@ -0,0 +1,169 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ + +#include +#include +#include +#include + +/* + * cleancache_ops contains the pointers to the cleancache "backend" + * implementation functions + */ +struct cleancache_ops cleancache_ops; +EXPORT_SYMBOL(cleancache_ops); + +/* useful stats available in /sys/kernel/mm/cleancache */ +static unsigned long succ_gets; +static unsigned long failed_gets; +static unsigned long puts; +static unsigned long flushes; + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleanache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0) { + ret = (*cleancache_ops.get_page)(pool_id, + page->mapping->host->i_ino, + page->index, + page); + if (ret == 0) + succ_gets++; + else + failed_gets++; + } + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. + */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0) { + (*cleancache_ops.put_page)(pool_id, page->mapping->host->i_ino, + page->index, page); + puts++; + } +} + +/* + * Flush any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. + */ +void __cleancache_flush_page(struct address_space *mapping, struct page *page) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + + if (pool_id >= 0) { + VM_BUG_ON(!PageLocked(page)); + (*cleancache_ops.flush_page)(pool_id, mapping->host->i_ino, + page->index); + flushes++; + } +} +EXPORT_SYMBOL(__cleancache_flush_page); + +/* + * Flush all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. 
+ */ +void __cleancache_flush_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + + if (pool_id >= 0) + (*cleancache_ops.flush_inode)(pool_id, mapping->host->i_ino); +} +EXPORT_SYMBOL(__cleancache_flush_inode); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ + +#define CLEANCACHE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t succ_gets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", succ_gets); +} +CLEANCACHE_ATTR_RO(succ_gets); + +static ssize_t failed_gets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", failed_gets); +} +CLEANCACHE_ATTR_RO(failed_gets); + +static ssize_t puts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", puts); +} +CLEANCACHE_ATTR_RO(puts); + +static ssize_t flushes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", flushes); +} +CLEANCACHE_ATTR_RO(flushes); + +static struct attribute *cleancache_attrs[] = { + &succ_gets_attr.attr, + &failed_gets_attr.attr, + &puts_attr.attr, + &flushes_attr.attr, + NULL, +}; + +static struct attribute_group cleancache_attr_group = { + .attrs = cleancache_attrs, + .name = "cleancache", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &cleancache_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} +module_init(init_cleancache) diff -NruabBEp linux-2.6.34.5/mm/filemap.c l-t/mm/filemap.c --- linux-2.6.34.5/mm/filemap.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/filemap.c 2010-08-24 12:32:18.000000000 +0400 @@ -34,6 +34,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include /* for page_is_file_cache() */ +#include #include "internal.h" /* @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct pag { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; diff -NruabBEp linux-2.6.34.5/mm/frontswap.c l-t/mm/frontswap.c --- linux-2.6.34.5/mm/frontswap.c 1970-01-01 03:00:00.000000000 +0300 +++ l-t/mm/frontswap.c 2010-08-24 12:32:23.000000000 +0400 @@ -0,0 +1,307 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * frontswap_ops contains the pointers to the frontswap "backend" + * implementation functions + */ +struct frontswap_ops frontswap_ops; +EXPORT_SYMBOL(frontswap_ops); + +/* one poolid is shared by frontswap across all "fronted" swap devices */ +int frontswap_poolid = -1; +EXPORT_SYMBOL(frontswap_poolid); + +/* useful stats available in /sys/kernel/mm/frontswap */ +static unsigned long gets; +static unsigned long succ_puts; +static unsigned long failed_puts; +static unsigned long flushes; + +/* + * "Put" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implementation may either overwrite the data + * and return success or flush the page from frontswap and return failure. + */ +int __frontswap_put_page(struct page *page) +{ + int ret = 0, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + unsigned long offset = (unsigned long)swp_offset(entry); + + VM_BUG_ON(!PageLocked(page)); + if (frontswap_test(sis, offset)) + dup = 1; + ret = (*frontswap_ops.put_page)(frontswap_poolid, type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + succ_puts++; + if (!dup) + sis->frontswap_pages++; + } else if (dup) { + /* + failed dup always results in automatic flush of + the (older) page from frontswap + */ + frontswap_clear(sis, offset); + sis->frontswap_pages--; + failed_puts++; + } else + failed_puts++; + return ret; +} + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_get_page(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + unsigned long offset = (unsigned long)swp_offset(entry); + + VM_BUG_ON(!PageLocked(page)); + if (frontswap_test(sis, offset)) + ret = (*frontswap_ops.get_page)(frontswap_poolid, + type, offset, page); + if (ret == 0) + gets++; + return ret; +} + +/* + * Flush any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_flush_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + if (frontswap_test(sis, offset)) { + (*frontswap_ops.flush_page)(frontswap_poolid, type, offset); + sis->frontswap_pages--; + frontswap_clear(sis, offset); + flushes++; + } +} + +/* + * Flush all data from frontswap associated with all offsets for the + * specified swaptype.
+ */ +void __frontswap_flush_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + (*frontswap_ops.flush_area)(frontswap_poolid, type); + sis->frontswap_pages = 0; + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to unuse enough + * frontswap pages to -- subject to memory constraints -- reduce the + * number of pages in frontswap down to the given target. + */ +void frontswap_shrink(unsigned long target_pages) +{ + int wrapped = 0; + bool locked = false; + + for (wrapped = 0; wrapped <= 3; wrapped++) { + + struct swap_info_struct *si = NULL; + unsigned long total_pages = 0, total_pages_to_unuse; + unsigned long pages = 0, unuse_pages = 0; + int type; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + locked = true; + total_pages = 0; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + total_pages += si->frontswap_pages; + } + if (total_pages <= target_pages) + goto out; + total_pages_to_unuse = total_pages - target_pages; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + if (total_pages_to_unuse < si->frontswap_pages) + pages = unuse_pages = total_pages_to_unuse; + else { + pages = si->frontswap_pages; + unuse_pages = 0; /* unuse all */ + } + if (security_vm_enough_memory_kern(pages)) + continue; + vm_unacct_memory(pages); + break; + } + if (type < 0) + goto out; + locked = false; + spin_unlock(&swap_lock); + current->flags |= PF_OOM_ORIGIN; + try_to_unuse(type, true, unuse_pages); + current->flags &= ~PF_OOM_ORIGIN; + } + +out: + if (locked) + spin_unlock(&swap_lock); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that a kernel module can + * determine current usage without reading sysfs.
+ */ +unsigned long frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += si->frontswap_pages; + } + spin_unlock(&swap_lock); + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */ + +#define FRONTSWAP_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define FRONTSWAP_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t curr_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", frontswap_curr_pages()); +} + +static ssize_t curr_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long target_pages; + int err; + + err = strict_strtoul(buf, 10, &target_pages); + if (err) + return -EINVAL; + + frontswap_shrink(target_pages); + + return count; +} +FRONTSWAP_ATTR(curr_pages); + +static ssize_t succ_puts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", succ_puts); +} +FRONTSWAP_ATTR_RO(succ_puts); + +static ssize_t failed_puts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", failed_puts); +} +FRONTSWAP_ATTR_RO(failed_puts); + +static ssize_t gets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", gets); +} +FRONTSWAP_ATTR_RO(gets); + +static ssize_t flushes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", flushes); +} +FRONTSWAP_ATTR_RO(flushes); + +static struct attribute *frontswap_attrs[] = { + &curr_pages_attr.attr, + &succ_puts_attr.attr, + &failed_puts_attr.attr, + &gets_attr.attr, + &flushes_attr.attr, + NULL, +}; + +static struct attribute_group frontswap_attr_group = { + .attrs = frontswap_attrs, + .name = "frontswap", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &frontswap_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} + +static void __exit exit_frontswap(void) +{ + frontswap_shrink(0UL); +} + +module_init(init_frontswap); +module_exit(exit_frontswap); diff -NruabBEp linux-2.6.34.5/mm/Kconfig l-t/mm/Kconfig --- linux-2.6.34.5/mm/Kconfig 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/Kconfig 2010-08-24 12:32:56.000000000 +0400 @@ -287,3 +287,42 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. + +config CLEANCACHE + bool "Enable cleancache pseudo-RAM driver to cache clean pages" + default y + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to put + it into a synchronous concurrency-safe page-oriented pseudo-RAM + device (such as Xen's Transcendent Memory, aka "tmem") which is not + directly accessible or addressable by the kernel and is of unknown + (and possibly time-varying) size. 
And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a pseudo-RAM device is available, a significant I/O reduction + may be achieved. When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache. + +config FRONTSWAP + bool "Enable frontswap pseudo-RAM driver to cache swap pages" + default y + help + Frontswap is so named because it can be thought of as the opposite of + a "backing" store for a swap device. The storage is assumed to be + a synchronous concurrency-safe page-oriented pseudo-RAM device (such + as Xen's Transcendent Memory, aka "tmem") which is not directly + accessible or addressable by the kernel and is of unknown (and + possibly time-varying) size. When a pseudo-RAM device is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit. + + If unsure, say Y to enable frontswap. + diff -NruabBEp linux-2.6.34.5/mm/Makefile l-t/mm/Makefile --- linux-2.6.34.5/mm/Makefile 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/Makefile 2010-08-24 12:32:59.000000000 +0400 @@ -17,6 +17,7 @@ obj-y += init-mm.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o @@ -44,3 +45,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-f obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff -NruabBEp linux-2.6.34.5/mm/memory_hotplug.c l-t/mm/memory_hotplug.c --- linux-2.6.34.5/mm/memory_hotplug.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/memory_hotplug.c 2010-08-24 12:27:42.000000000 +0400 @@ -481,22 +481,13 @@ static void rollback_node_hotadd(int nid return; } - /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory(int nid, u64 start, u64 size) +static int __ref __add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; - struct resource *res; int ret; - lock_system_sleep(); - - res = register_memory_resource(start, size); - ret = -EEXIST; - if (!res) - goto out; - if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; @@ -530,10 +521,44 @@ int __ref add_memory(int nid, u64 start, goto out; error: - /* rollback pgdat allocation and others */ + /* rollback pgdat allocation */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - if (res) + +out: + return ret; +} + +int add_registered_memory(int nid, u64 start, u64 size) +{ + int ret; + + lock_system_sleep(); + ret = __add_memory(nid, start, size); + + unlock_system_sleep(); + + return ret; +} +EXPORT_SYMBOL_GPL(add_registered_memory); + +int add_memory(int nid, u64 start, u64 size) +{ + int ret = -EEXIST; + struct resource *res; + + lock_system_sleep(); + + res = register_memory_resource(start, size); + + if (!res) + goto out; + + ret = __add_memory(nid, start, size); + + if (!ret) + goto out; + + release_memory_resource(res); out: diff -NruabBEp linux-2.6.34.5/mm/mmap.c l-t/mm/mmap.c ---
linux-2.6.34.5/mm/mmap.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/mmap.c 2010-08-24 12:31:52.000000000 +0400 @@ -87,6 +87,12 @@ int sysctl_overcommit_ratio = 50; /* def int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; +unsigned long vm_get_committed_as(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL(vm_get_committed_as); + /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff -NruabBEp linux-2.6.34.5/mm/page_io.c l-t/mm/page_io.c --- linux-2.6.34.5/mm/page_io.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/page_io.c 2010-08-24 12:31:30.000000000 +0400 @@ -18,6 +18,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, st unlock_page(page); goto out; } + if (frontswap_put_page(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_get_page(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff -NruabBEp linux-2.6.34.5/mm/swapfile.c l-t/mm/swapfile.c --- linux-2.6.34.5/mm/swapfile.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/swapfile.c 2010-08-24 12:31:56.000000000 +0400 @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -41,7 +43,7 @@ static bool swap_count_continued(struct static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -52,9 +54,9 @@ static const char Unused_file[] = "Unuse static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -585,6 +587,7 @@ static unsigned char swap_entry_free(str swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_flush_page(p->type, offset); } return usage; @@ -1027,7 +1030,7 @@ static int unuse_mm(struct mm_struct *mm * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1053,6 +1056,12 @@ static unsigned int find_next_to_unuse(s prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1064,8 +1073,12 @@ static unsigned int find_next_to_unuse(s * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. 
+ * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1098,7 +1111,7 @@ static int try_to_unuse(unsigned int typ * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1265,6 +1278,11 @@ static int try_to_unuse(unsigned int typ * interactive performance. */ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } + } mmput(start_mm); @@ -1590,7 +1608,7 @@ SYSCALL_DEFINE1(swapoff, const char __us spin_unlock(&swap_lock); current->flags |= PF_OOM_ORIGIN; - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); current->flags &= ~PF_OOM_ORIGIN; if (err) { @@ -1642,9 +1660,12 @@ SYSCALL_DEFINE1(swapoff, const char __us swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_flush_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + if (p->frontswap_map) + vfree(p->frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1800,6 +1821,7 @@ SYSCALL_DEFINE2(swapon, const char __use unsigned long maxpages; unsigned long swapfilepages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; @@ -2020,6 +2042,12 @@ SYSCALL_DEFINE2(swapon, const char __use error = -EINVAL; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) { + frontswap_map = vmalloc(maxpages / sizeof(long)); + if (frontswap_map) + memset(frontswap_map, 0, maxpages / sizeof(long)); + } if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2038,16 +2066,18 @@ SYSCALL_DEFINE2(swapon, const char __use else p->prio = --least_priority; p->swap_map = swap_map; + p->frontswap_map = frontswap_map; p->flags |= SWP_WRITEOK; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->frontswap_map) ? 
"FS" : ""); /* insert swap space into swap_list: */ prev = -1; @@ -2061,6 +2091,7 @@ SYSCALL_DEFINE2(swapon, const char __use swap_list.head = swap_list.next = type; else swap_info[prev]->next = type; + frontswap_init(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); error = 0; @@ -2245,6 +2276,10 @@ int valid_swaphandles(swp_entry_t entry, base++; spin_lock(&swap_lock); + if (frontswap_test(si, target)) { + spin_unlock(&swap_lock); + return 0; + } if (end > si->max) /* don't go beyond end of map */ end = si->max; @@ -2255,6 +2290,9 @@ int valid_swaphandles(swp_entry_t entry, break; if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; + /* Don't read in frontswap pages */ + if (frontswap_test(si, toff)) + break; } /* Count contiguous allocated slots below our target */ for (toff = target; --toff >= base; nr_pages++) { @@ -2263,6 +2301,9 @@ int valid_swaphandles(swp_entry_t entry, break; if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; + /* Don't read in frontswap pages */ + if (frontswap_test(si, toff)) + break; } spin_unlock(&swap_lock); diff -NruabBEp linux-2.6.34.5/mm/truncate.c l-t/mm/truncate.c --- linux-2.6.34.5/mm/truncate.c 2010-08-20 22:52:05.000000000 +0400 +++ l-t/mm/truncate.c 2010-08-24 12:31:43.000000000 +0400 @@ -19,6 +19,7 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ +#include #include "internal.h" @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -108,6 +110,10 @@ truncate_complete_page(struct address_sp clear_page_mlock(page); remove_from_page_cache(page); ClearPageMappedToDisk(page); + /* this must be after the remove_from_page_cache which + * calls cleancache_put_page (and note page->mapping is now NULL) + */ + cleancache_flush_page(mapping, page); page_cache_release(page); /* pagecache ref */ return 0; } @@ -215,6 +221,7 @@ void truncate_inode_pages_range(struct a pgoff_t next; int i; + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -290,6 +297,7 @@ void truncate_inode_pages_range(struct a pagevec_release(&pvec); mem_cgroup_uncharge_end(); } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -428,6 +436,7 @@ int invalidate_inode_pages2_range(struct int did_range_unmap = 0; int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -486,6 +495,7 @@ int invalidate_inode_pages2_range(struct mem_cgroup_uncharge_end(); cond_resched(); } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);