static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -104,6 +106,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h
--- linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h 2008-08-01 19:03:21.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h 2008-08-01 19:22:10.000000000 +0900
@@ -54,6 +54,10 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct mem_cgroup *mem_cgroup;
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+#ifdef CONFIG_CGROUP_BIO
+ struct list_head blist; /* for bio_cgroup page list */
+ struct bio_cgroup *bio_cgroup;
+#endif
struct page *page;
int flags;
};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/init/Kconfig linux-2.6.27-rc1-mm1.cg2/init/Kconfig
--- linux-2.6.27-rc1-mm1.cg1/init/Kconfig 2008-08-01 19:03:21.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/init/Kconfig 2008-08-01 19:18:38.000000000 +0900
@@ -418,9 +418,20 @@ config CGROUP_MEMRLIMIT_CTLR
memory RSS and Page Cache control. Virtual address space control
is provided by this controller.
+config CGROUP_BIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables to track the onwner
+ of every Block I/O.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO
config SYSFS_DEPRECATED
bool
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/Makefile linux-2.6.27-rc1-mm1.cg2/mm/Makefile
--- linux-2.6.27-rc1-mm1.cg1/mm/Makefile 2008-08-01 19:03:21.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/Makefile 2008-08-01 19:18:38.000000000 +0900
@@ -35,4 +35,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
+obj-$(CONFIG_CGROUP_BIO) += biocontrol.o
obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c 2008-08-01 19:35:51.000000000 +0900
@@ -0,0 +1,233 @@
+/* biocontrol.c - Block I/O Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Copyright VA Linux Systems Japan, 2008
+ * Author Hirokazu Takahashi <taka@valinux.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+#include <linux/biocontrol.h>
+
+/* return corresponding bio_cgroup object of a cgroup */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+static struct idr bio_cgroup_id;
+static DEFINE_SPINLOCK(bio_cgroup_idr_lock);
+
+static struct cgroup_subsys_state *
+bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog;
+ struct io_context *ioc;
+ int error;
+
+ if (!cgrp->parent) {
+ static struct bio_cgroup default_bio_cgroup;
+ static struct io_context default_bio_io_context;
+
+ biog = &default_bio_cgroup;
+ ioc = &default_bio_io_context;
+ init_io_context(ioc);
+
+ idr_init(&bio_cgroup_id);
+ biog->id = 0;
+
+ page_cgroup_init();
+ } else {
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc || !biog) {
+ error = -ENOMEM;
+ goto out;
+ }
+retry:
+ if (unlikely(!idr_pre_get(&bio_cgroup_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ error = idr_get_new_above(&bio_cgroup_id,
+ (void *)biog, 1, &biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ if (error == -EAGAIN)
+ goto retry;
+ else if (error)
+ goto out;
+ }
+
+ ioc->id = biog->id;
+ biog->io_context = ioc;
+
+ INIT_LIST_HEAD(&biog->page_list);
+ spin_lock_init(&biog->page_list_lock);
+
+ /* Bind the cgroup to bio_cgroup object we just created */
+ biog->css.cgroup = cgrp;
+
+ return &biog->css;
+out:
+ if (ioc)
+ put_io_context(ioc);
+ kfree(biog);
+ return ERR_PTR(error);
+}
+
+#define FORCE_UNCHARGE_BATCH (128)
+static void bio_cgroup_force_empty(struct bio_cgroup *biog)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count = FORCE_UNCHARGE_BATCH;
+ struct list_head *list = &biog->page_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ while (!list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, blist);
+ page = pc->page;
+ get_page(page);
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ }
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ return;
+}
+
+static void bio_cgroup_pre_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+ bio_cgroup_force_empty(biog);
+}
+
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ put_io_context(biog->io_context);
+
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ idr_remove(&bio_cgroup_id, biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+
+ kfree(biog);
+}
+
+struct bio_cgroup *find_bio_cgroup(int id)
+{
+ struct bio_cgroup *biog;
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ biog = (struct bio_cgroup *)
+ idr_find(&bio_cgroup_id, id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ get_bio_cgroup(biog);
+ return biog;
+}
+
+struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ struct io_context *ioc;
+ struct page_cgroup *pc;
+ struct bio_cgroup *biog;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc)
+ biog = pc->bio_cgroup;
+ else
+ biog = bio_cgroup_from_task(rcu_dereference(init_mm.owner ));
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ unlock_page_cgroup(page);
+ return ioc;
+}
+EXPORT_SYMBOL(get_bio_cgroup_iocontext);
+
+static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ return (u64) biog->id;
+}
+
+
+static struct cftype bio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = bio_id_read,
+ },
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ if (bio_cgroup_disabled())
+ return 0;
+ return cgroup_add_files(cont, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+static void bio_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct bio_cgroup *biog, *old_biog;
+
+ if (bio_cgroup_disabled())
+ return;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ return;
+
+ biog = cgroup_bio(cont);
+ old_biog = cgroup_bio(old_cont);
+
+ mmput(mm);
+ return;
+}
+
+
+struct cgroup_subsys bio_cgroup_subsys = {
+ .name = "bio",
+ .subsys_id = bio_cgroup_subsys_id,
+ .create = bio_cgroup_create,
+ .destroy = bio_cgroup_destroy,
+ .pre_destroy = bio_cgroup_pre_destroy,
+ .populate = bio_cgroup_populate,
+ .attach = bio_cgroup_move_task,
+ .early_init = 0,
+};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c 2008-08-01 19:49:38.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c 2008-08-01 19:49:53.000000000 +0900
@@ -20,6 +20,7 @@
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/biocontrol.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -1019,11 +1020,12 @@ struct page_cgroup *page_get_page_cgroup
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg, struct bio_cgroup *biocg)
{
struct page_cgroup *pc;
struct mem_cgroup *mem;
+ struct bio_cgroup *biog;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
if (unlikely(pc == NULL))
@@ -1035,18 +1037,15 @@ static int mem_cgroup_charge_common(stru
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (likely(!memcg)) {
- rcu_read_lock();
- mem = mm_get_mem_cgroup(mm);
- rcu_read_unlock();
- } else {
- mem = memcg;
- get_mem_cgroup(mem);
- }
+ rcu_read_lock();
+ mem = memcg ? memcg : mm_get_mem_cgroup(mm);
+ biog = biocg ? biocg : mm_get_bio_cgroup(mm);
+ rcu_read_unlock();
if (mem_cgroup_try_to_allocate(mem, gfp_mask) < 0)
goto out;
set_mem_cgroup(pc, mem);
+ set_bio_cgroup(pc, biog);
pc->page = page;
/*
* If a page is accounted as a page cache, insert to inactive list.
@@ -1065,18 +1064,21 @@ static int mem_cgroup_charge_common(stru
if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
clear_mem_cgroup(pc);
+ clear_bio_cgroup(pc);
kmem_cache_free(page_cgroup_cache, pc);
goto done;
}
page_assign_page_cgroup(page, pc);
This function can be more simplified, there is some unnecessary code
from old version.
I think it is neccessary to attach the task p to new biog.
I tried to put a process into a group by writing to
"/cgroup/bio/BGROUP/tasks" but failed.
Could you tell me what you actually did? I will try the same thing.
--
Ryo Tsuruta <ryov@valinux.co.jp>
I wanted to test my own scheduler which uses bio tracking information.
SO I tried your patch, especially, get_bio_cgroup_iocontext(), to get
the io_context from bio.
In my test, I made some threads with certain iopriorities run
concurrently. To schedule these threads based on their iopriorities,
I made BGROUP directories for each iopriorities.
e.g. /cgroup/bio/be0 ... /cgroup/bio/be7
Then, I tried to attach the processes to the appropriate groups.
But the processes stayed in the original group(id=0).
...
I am sorry but I have to leave now and I cannot come here next week.
--> I will take summer holidays.
I will reply to you later.
Thanks,
- Takuya Yoshikawa
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
08-08-2008, 11:41 AM
Ryo Tsuruta
bio-cgroup: Implement the bio-cgroup
Hi Yoshikawa-san,
> I wanted to test my own scheduler which uses bio tracking information.
> SO I tried your patch, especially, get_bio_cgroup_iocontext(), to get
> the io_context from bio.
>
> In my test, I made some threads with certain iopriorities run
> concurrently. To schedule these threads based on their iopriorities,
> I made BGROUP directories for each iopriorities.
> e.g. /cgroup/bio/be0 ... /cgroup/bio/be7
> Then, I tried to attach the processes to the appropriate groups.
>
> But the processes stayed in the original group(id=0).
In the current implementation, when a process moves to an another cgroup:
- Already allocated memory does not move to the cgroup, still remains.
- Only allocated memory after move belongs to the cgroup.
This behavior follows the memory controller.
Memory does not move between cgroups since it is so heavy operation,
but it would be worth under some sort of conditions.
Could you try to move a process between cgroups in the following way?
static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -104,6 +106,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h
--- linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h 2008-08-12 14:47:22.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h 2008-08-12 14:48:00.000000000 +0900
@@ -54,6 +54,10 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct mem_cgroup *mem_cgroup;
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+#ifdef CONFIG_CGROUP_BIO
+ struct list_head blist; /* for bio_cgroup page list */
+ struct bio_cgroup *bio_cgroup;
+#endif
struct page *page;
int flags;
};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/init/Kconfig linux-2.6.27-rc1-mm1.cg2/init/Kconfig
--- linux-2.6.27-rc1-mm1.cg1/init/Kconfig 2008-08-12 14:47:22.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/init/Kconfig 2008-08-12 14:48:00.000000000 +0900
@@ -418,9 +418,20 @@ config CGROUP_MEMRLIMIT_CTLR
memory RSS and Page Cache control. Virtual address space control
is provided by this controller.
+config CGROUP_BIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables to track the onwner
+ of every Block I/O.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO
config SYSFS_DEPRECATED
bool
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/Makefile linux-2.6.27-rc1-mm1.cg2/mm/Makefile
--- linux-2.6.27-rc1-mm1.cg1/mm/Makefile 2008-08-12 14:47:22.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/Makefile 2008-08-12 14:48:00.000000000 +0900
@@ -35,4 +35,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
+obj-$(CONFIG_CGROUP_BIO) += biocontrol.o
obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c 2008-08-12 15:06:12.000000000 +0900
@@ -0,0 +1,219 @@
+/* biocontrol.c - Block I/O Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Copyright VA Linux Systems Japan, 2008
+ * Author Hirokazu Takahashi <taka@valinux.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+#include <linux/biocontrol.h>
+
+/* return corresponding bio_cgroup object of a cgroup */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+static struct idr bio_cgroup_id;
+static DEFINE_SPINLOCK(bio_cgroup_idr_lock);
+
+static struct cgroup_subsys_state *
+bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog;
+ struct io_context *ioc;
+ int error;
+
+ if (!cgrp->parent) {
+ static struct bio_cgroup default_bio_cgroup;
+ static struct io_context default_bio_io_context;
+
+ biog = &default_bio_cgroup;
+ ioc = &default_bio_io_context;
+ init_io_context(ioc);
+
+ idr_init(&bio_cgroup_id);
+ biog->id = 0;
+
+ page_cgroup_init();
+ } else {
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc || !biog) {
+ error = -ENOMEM;
+ goto out;
+ }
+retry:
+ if (unlikely(!idr_pre_get(&bio_cgroup_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ error = idr_get_new_above(&bio_cgroup_id,
+ (void *)biog, 1, &biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ if (error == -EAGAIN)
+ goto retry;
+ else if (error)
+ goto out;
+ }
+
+ ioc->id = biog->id;
+ biog->io_context = ioc;
+
+ INIT_LIST_HEAD(&biog->page_list);
+ spin_lock_init(&biog->page_list_lock);
+
+ /* Bind the cgroup to bio_cgroup object we just created */
+ biog->css.cgroup = cgrp;
+
+ return &biog->css;
+out:
+ if (ioc)
+ put_io_context(ioc);
+ kfree(biog);
+ return ERR_PTR(error);
+}
+
+#define FORCE_UNCHARGE_BATCH (128)
+static void bio_cgroup_force_empty(struct bio_cgroup *biog)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count = FORCE_UNCHARGE_BATCH;
+ struct list_head *list = &biog->page_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ while (!list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, blist);
+ page = pc->page;
+ get_page(page);
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ }
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ return;
+}
+
+static void bio_cgroup_pre_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+ bio_cgroup_force_empty(biog);
+}
+
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ put_io_context(biog->io_context);
+
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ idr_remove(&bio_cgroup_id, biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+
+ kfree(biog);
+}
+
+struct bio_cgroup *find_bio_cgroup(int id)
+{
+ struct bio_cgroup *biog;
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ biog = (struct bio_cgroup *)
+ idr_find(&bio_cgroup_id, id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ get_bio_cgroup(biog);
+ return biog;
+}
+
+struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ struct io_context *ioc;
+ struct page_cgroup *pc;
+ struct bio_cgroup *biog;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc)
+ biog = pc->bio_cgroup;
+ else
+ biog = bio_cgroup_from_task(rcu_dereference(init_mm.owner ));
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ unlock_page_cgroup(page);
+ return ioc;
+}
+EXPORT_SYMBOL(get_bio_cgroup_iocontext);
+
+static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ return (u64) biog->id;
+}
+
+
+static struct cftype bio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = bio_id_read,
+ },
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ if (bio_cgroup_disabled())
+ return 0;
+ return cgroup_add_files(cont, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+static void bio_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ /* do nothing */
+}
+
+
+struct cgroup_subsys bio_cgroup_subsys = {
+ .name = "bio",
+ .subsys_id = bio_cgroup_subsys_id,
+ .create = bio_cgroup_create,
+ .destroy = bio_cgroup_destroy,
+ .pre_destroy = bio_cgroup_pre_destroy,
+ .populate = bio_cgroup_populate,
+ .attach = bio_cgroup_move_task,
+ .early_init = 0,
+};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c 2008-08-12 14:47:40.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c 2008-08-12 14:48:00.000000000 +0900
@@ -20,6 +20,7 @@
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/biocontrol.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -1019,11 +1020,12 @@ struct page_cgroup *page_get_page_cgroup
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg, struct bio_cgroup *biocg)
{
struct page_cgroup *pc;
struct mem_cgroup *mem;
+ struct bio_cgroup *biog;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
if (unlikely(pc == NULL))
@@ -1035,18 +1037,15 @@ static int mem_cgroup_charge_common(stru
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (likely(!memcg)) {
- rcu_read_lock();
- mem = mm_get_mem_cgroup(mm);
- rcu_read_unlock();
- } else {
- mem = memcg;
- get_mem_cgroup(mem);
- }
+ rcu_read_lock();
+ mem = memcg ? memcg : mm_get_mem_cgroup(mm);
+ biog = biocg ? biocg : mm_get_bio_cgroup(mm);
+ rcu_read_unlock();
if (mem_cgroup_try_to_allocate(mem, gfp_mask) < 0)
goto out;
set_mem_cgroup(pc, mem);
+ set_bio_cgroup(pc, biog);
pc->page = page;
/*
* If a page is accounted as a page cache, insert to inactive list.
@@ -1065,18 +1064,21 @@ static int mem_cgroup_charge_common(stru
if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
clear_mem_cgroup(pc);
+ clear_bio_cgroup(pc);
kmem_cache_free(page_cgroup_cache, pc);
goto done;
}
page_assign_page_cgroup(page, pc);
static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -104,6 +106,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h
--- linux-2.6.27-rc1-mm1.cg1/include/linux/memcontrol.h 2008-09-19 18:50:59.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/include/linux/memcontrol.h 2008-09-19 18:50:59.000000000 +0900
@@ -54,6 +54,10 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct mem_cgroup *mem_cgroup;
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+#ifdef CONFIG_CGROUP_BIO
+ struct list_head blist; /* for bio_cgroup page list */
+ struct bio_cgroup *bio_cgroup;
+#endif
struct page *page;
int flags;
};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/init/Kconfig linux-2.6.27-rc1-mm1.cg2/init/Kconfig
--- linux-2.6.27-rc1-mm1.cg1/init/Kconfig 2008-09-19 18:50:59.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/init/Kconfig 2008-09-19 18:50:59.000000000 +0900
@@ -418,9 +418,20 @@ config CGROUP_MEMRLIMIT_CTLR
memory RSS and Page Cache control. Virtual address space control
is provided by this controller.
+config CGROUP_BIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables to track the onwner
+ of every Block I/O.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO
config SYSFS_DEPRECATED
bool
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/biocontrol.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/biocontrol.c 2008-09-19 18:50:59.000000000 +0900
@@ -0,0 +1,219 @@
+/* biocontrol.c - Block I/O Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Copyright VA Linux Systems Japan, 2008
+ * Author Hirokazu Takahashi <taka@valinux.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+#include <linux/biocontrol.h>
+
+/* return corresponding bio_cgroup object of a cgroup */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+static struct idr bio_cgroup_id;
+static DEFINE_SPINLOCK(bio_cgroup_idr_lock);
+
+static struct cgroup_subsys_state *
+bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog;
+ struct io_context *ioc;
+ int error;
+
+ if (!cgrp->parent) {
+ static struct bio_cgroup default_bio_cgroup;
+ static struct io_context default_bio_io_context;
+
+ biog = &default_bio_cgroup;
+ ioc = &default_bio_io_context;
+ init_io_context(ioc);
+
+ idr_init(&bio_cgroup_id);
+ biog->id = 0;
+
+ page_cgroup_init();
+ } else {
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc || !biog) {
+ error = -ENOMEM;
+ goto out;
+ }
+retry:
+ if (unlikely(!idr_pre_get(&bio_cgroup_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ error = idr_get_new_above(&bio_cgroup_id,
+ (void *)biog, 1, &biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ if (error == -EAGAIN)
+ goto retry;
+ else if (error)
+ goto out;
+ }
+
+ ioc->id = biog->id;
+ biog->io_context = ioc;
+
+ INIT_LIST_HEAD(&biog->page_list);
+ spin_lock_init(&biog->page_list_lock);
+
+ /* Bind the cgroup to bio_cgroup object we just created */
+ biog->css.cgroup = cgrp;
+
+ return &biog->css;
+out:
+ if (ioc)
+ put_io_context(ioc);
+ kfree(biog);
+ return ERR_PTR(error);
+}
+
+#define FORCE_UNCHARGE_BATCH (128)
+static void bio_cgroup_force_empty(struct bio_cgroup *biog)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count = FORCE_UNCHARGE_BATCH;
+ struct list_head *list = &biog->page_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ while (!list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, blist);
+ page = pc->page;
+ get_page(page);
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ }
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ return;
+}
+
+static void bio_cgroup_pre_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+ bio_cgroup_force_empty(biog);
+}
+
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ put_io_context(biog->io_context);
+
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ idr_remove(&bio_cgroup_id, biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+
+ kfree(biog);
+}
+
+struct bio_cgroup *find_bio_cgroup(int id)
+{
+ struct bio_cgroup *biog;
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ biog = (struct bio_cgroup *)
+ idr_find(&bio_cgroup_id, id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ get_bio_cgroup(biog);
+ return biog;
+}
+
+struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ struct io_context *ioc;
+ struct page_cgroup *pc;
+ struct bio_cgroup *biog;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc)
+ biog = pc->bio_cgroup;
+ else
+ biog = bio_cgroup_from_task(rcu_dereference(init_mm.owner ));
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ unlock_page_cgroup(page);
+ return ioc;
+}
+EXPORT_SYMBOL(get_bio_cgroup_iocontext);
+
+static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ return (u64) biog->id;
+}
+
+
+static struct cftype bio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = bio_id_read,
+ },
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ if (bio_cgroup_disabled())
+ return 0;
+ return cgroup_add_files(cont, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+static void bio_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ /* do nothing */
+}
+
+
+struct cgroup_subsys bio_cgroup_subsys = {
+ .name = "bio",
+ .subsys_id = bio_cgroup_subsys_id,
+ .create = bio_cgroup_create,
+ .destroy = bio_cgroup_destroy,
+ .pre_destroy = bio_cgroup_pre_destroy,
+ .populate = bio_cgroup_populate,
+ .attach = bio_cgroup_move_task,
+ .early_init = 0,
+};
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/Makefile linux-2.6.27-rc1-mm1.cg2/mm/Makefile
--- linux-2.6.27-rc1-mm1.cg1/mm/Makefile 2008-09-19 18:50:59.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/Makefile 2008-09-19 18:50:59.000000000 +0900
@@ -35,4 +35,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
+obj-$(CONFIG_CGROUP_BIO) += biocontrol.o
obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c
--- linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c 2008-09-19 18:50:59.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/mm/memcontrol.c 2008-09-19 18:50:59.000000000 +0900
@@ -20,6 +20,7 @@
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/biocontrol.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -1019,11 +1020,12 @@ struct page_cgroup *page_get_page_cgroup
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg, struct bio_cgroup *biocg)
{
struct page_cgroup *pc;
struct mem_cgroup *mem;
+ struct bio_cgroup *biog;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
if (unlikely(pc == NULL))
@@ -1035,18 +1037,15 @@ static int mem_cgroup_charge_common(stru
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (likely(!memcg)) {
- rcu_read_lock();
- mem = mm_get_mem_cgroup(mm);
- rcu_read_unlock();
- } else {
- mem = memcg;
- get_mem_cgroup(mem);
- }
+ rcu_read_lock();
+ mem = memcg ? memcg : mm_get_mem_cgroup(mm);
+ biog = biocg ? biocg : mm_get_bio_cgroup(mm);
+ rcu_read_unlock();
if (mem_cgroup_try_to_allocate(mem, gfp_mask) < 0)
goto out;
set_mem_cgroup(pc, mem);
+ set_bio_cgroup(pc, biog);
pc->page = page;
/*
* If a page is accounted as a page cache, insert to inactive list.
@@ -1065,18 +1064,21 @@ static int mem_cgroup_charge_common(stru
if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
clear_mem_cgroup(pc);
+ clear_bio_cgroup(pc);
kmem_cache_free(page_cgroup_cache, pc);
goto done;
}
page_assign_page_cgroup(page, pc);
static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -104,6 +106,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
diff -Ndupr linux-2.6.27-rc5-mm1.cg1/include/linux/memcontrol.h linux-2.6.27-rc5-mm1.cg2/include/linux/memcontrol.h
--- linux-2.6.27-rc5-mm1.cg1/include/linux/memcontrol.h 2008-09-24 16:34:48.000000000 +0900
+++ linux-2.6.27-rc5-mm1.cg2/include/linux/memcontrol.h 2008-09-24 16:34:51.000000000 +0900
@@ -54,6 +54,10 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct mem_cgroup *mem_cgroup;
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+#ifdef CONFIG_CGROUP_BIO
+ struct list_head blist; /* for bio_cgroup page list */
+ struct bio_cgroup *bio_cgroup;
+#endif
struct page *page;
int flags;
};
diff -Ndupr linux-2.6.27-rc5-mm1.cg1/init/Kconfig linux-2.6.27-rc5-mm1.cg2/init/Kconfig
--- linux-2.6.27-rc5-mm1.cg1/init/Kconfig 2008-09-24 16:34:48.000000000 +0900
+++ linux-2.6.27-rc5-mm1.cg2/init/Kconfig 2008-09-24 16:34:51.000000000 +0900
@@ -425,9 +425,20 @@ config CGROUP_MEMRLIMIT_CTLR
memory RSS and Page Cache control. Virtual address space control
is provided by this controller.
+config CGROUP_BIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables to track the onwner
+ of every Block I/O.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO
config SYSFS_DEPRECATED
bool
diff -Ndupr linux-2.6.27-rc5-mm1.cg1/mm/biocontrol.c linux-2.6.27-rc5-mm1.cg2/mm/biocontrol.c
--- linux-2.6.27-rc5-mm1.cg1/mm/biocontrol.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.27-rc5-mm1.cg2/mm/biocontrol.c 2008-09-24 16:34:51.000000000 +0900
@@ -0,0 +1,219 @@
+/* biocontrol.c - Block I/O Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Copyright VA Linux Systems Japan, 2008
+ * Author Hirokazu Takahashi <taka@valinux.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+#include <linux/biocontrol.h>
+
+/* return corresponding bio_cgroup object of a cgroup */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+static struct idr bio_cgroup_id;
+static DEFINE_SPINLOCK(bio_cgroup_idr_lock);
+
+static struct cgroup_subsys_state *
+bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog;
+ struct io_context *ioc;
+ int error;
+
+ if (!cgrp->parent) {
+ static struct bio_cgroup default_bio_cgroup;
+ static struct io_context default_bio_io_context;
+
+ biog = &default_bio_cgroup;
+ ioc = &default_bio_io_context;
+ init_io_context(ioc);
+
+ idr_init(&bio_cgroup_id);
+ biog->id = 0;
+
+ page_cgroup_init();
+ } else {
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc || !biog) {
+ error = -ENOMEM;
+ goto out;
+ }
+retry:
+ if (unlikely(!idr_pre_get(&bio_cgroup_id, GFP_KERNEL))) {
+ error = -EAGAIN;
+ goto out;
+ }
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ error = idr_get_new_above(&bio_cgroup_id,
+ (void *)biog, 1, &biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ if (error == -EAGAIN)
+ goto retry;
+ else if (error)
+ goto out;
+ }
+
+ ioc->id = biog->id;
+ biog->io_context = ioc;
+
+ INIT_LIST_HEAD(&biog->page_list);
+ spin_lock_init(&biog->page_list_lock);
+
+ /* Bind the cgroup to bio_cgroup object we just created */
+ biog->css.cgroup = cgrp;
+
+ return &biog->css;
+out:
+ if (ioc)
+ put_io_context(ioc);
+ kfree(biog);
+ return ERR_PTR(error);
+}
+
+#define FORCE_UNCHARGE_BATCH (128)
+static void bio_cgroup_force_empty(struct bio_cgroup *biog)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count = FORCE_UNCHARGE_BATCH;
+ struct list_head *list = &biog->page_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ while (!list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, blist);
+ page = pc->page;
+ get_page(page);
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ spin_lock_irqsave(&biog->page_list_lock, flags);
+ }
+ spin_unlock_irqrestore(&biog->page_list_lock, flags);
+ return;
+}
+
+static void bio_cgroup_pre_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+ bio_cgroup_force_empty(biog);
+}
+
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ put_io_context(biog->io_context);
+
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ idr_remove(&bio_cgroup_id, biog->id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+
+ kfree(biog);
+}
+
+struct bio_cgroup *find_bio_cgroup(int id)
+{
+ struct bio_cgroup *biog;
+ spin_lock_irq(&bio_cgroup_idr_lock);
+ biog = (struct bio_cgroup *)
+ idr_find(&bio_cgroup_id, id);
+ spin_unlock_irq(&bio_cgroup_idr_lock);
+ get_bio_cgroup(biog);
+ return biog;
+}
+
+struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ struct io_context *ioc;
+ struct page_cgroup *pc;
+ struct bio_cgroup *biog;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc)
+ biog = pc->bio_cgroup;
+ else
+ biog = bio_cgroup_from_task(rcu_dereference(init_mm.owner ));
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ unlock_page_cgroup(page);
+ return ioc;
+}
+EXPORT_SYMBOL(get_bio_cgroup_iocontext);
+
+static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ return (u64) biog->id;
+}
+
+
+static struct cftype bio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = bio_id_read,
+ },
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ if (bio_cgroup_disabled())
+ return 0;
+ return cgroup_add_files(cont, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+static void bio_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ /* do nothing */
+}
+
+
+struct cgroup_subsys bio_cgroup_subsys = {
+ .name = "bio",
+ .subsys_id = bio_cgroup_subsys_id,
+ .create = bio_cgroup_create,
+ .destroy = bio_cgroup_destroy,
+ .pre_destroy = bio_cgroup_pre_destroy,
+ .populate = bio_cgroup_populate,
+ .attach = bio_cgroup_move_task,
+ .early_init = 0,
+};
diff -Ndupr linux-2.6.27-rc5-mm1.cg1/mm/Makefile linux-2.6.27-rc5-mm1.cg2/mm/Makefile
--- linux-2.6.27-rc5-mm1.cg1/mm/Makefile 2008-09-24 16:34:48.000000000 +0900
+++ linux-2.6.27-rc5-mm1.cg2/mm/Makefile 2008-09-24 16:34:51.000000000 +0900
@@ -35,5 +35,6 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
+obj-$(CONFIG_CGROUP_BIO) += biocontrol.o
obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
diff -Ndupr linux-2.6.27-rc5-mm1.cg1/mm/memcontrol.c linux-2.6.27-rc5-mm1.cg2/mm/memcontrol.c
--- linux-2.6.27-rc5-mm1.cg1/mm/memcontrol.c 2008-09-24 16:34:48.000000000 +0900
+++ linux-2.6.27-rc5-mm1.cg2/mm/memcontrol.c 2008-09-24 16:34:51.000000000 +0900
@@ -20,6 +20,7 @@
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/biocontrol.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -1021,11 +1022,12 @@ struct page_cgroup *page_get_page_cgroup
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg, struct bio_cgroup *biocg)
{
struct page_cgroup *pc;
struct mem_cgroup *mem;
+ struct bio_cgroup *biog;
pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
if (unlikely(pc == NULL))
@@ -1037,18 +1039,15 @@ static int mem_cgroup_charge_common(stru
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (likely(!memcg)) {
- rcu_read_lock();
- mem = mm_get_mem_cgroup(mm);
- rcu_read_unlock();
- } else {
- mem = memcg;
- get_mem_cgroup(mem);
- }
+ rcu_read_lock();
+ mem = memcg ? memcg : mm_get_mem_cgroup(mm);
+ biog = biocg ? biocg : mm_get_bio_cgroup(mm);
+ rcu_read_unlock();
if (mem_cgroup_try_to_allocate(mem, gfp_mask) < 0)
goto out;
set_mem_cgroup(pc, mem);
+ set_bio_cgroup(pc, biog);
pc->page = page;
/*
* If a page is accounted as a page cache, insert to inactive list.
@@ -1067,18 +1066,21 @@ static int mem_cgroup_charge_common(stru
if (unlikely(page_get_page_cgroup(page))) {
unlock_page_cgroup(page);
clear_mem_cgroup(pc);
+ clear_bio_cgroup(pc);
kmem_cache_free(page_cgroup_cache, pc);
goto done;
}
page_assign_page_cgroup(page, pc);