[Xen-devel] Re: [PATCH] xen: add xenfs to allow usermode <-> Xen

Andrew Morton wrote:

On Tue, 16 Dec 2008 12:27:37 -0800
Jeremy Fitzhardinge <jeremy@xxxxxxxx> wrote:

[ Reviewers: This is in drivers/xen to keep it close to the code it is and will 
be using.  Would people
  prefer to see it in fs/xenfs? -J ]


Well it would be nice to keep filesystems under ./fs, but `grep -rl
register_filesystem .' says we screwed that pooch.

From: Alex Zeffertt <alex.zeffertt@xxxxxxxxxxxxx>

The xenfs filesystem exports various interfaces to usermode.  Initially
this exports a file to allow usermode to interact with xenbus/xenstore.

Traditionally this appeared in /proc/xen.  Rather than extending procfs,
this patch adds a backward-compat mountpoint on /proc/xen, and provides
a xenfs filesystem which can be mounted there.


...

--- /dev/null
+++ b/drivers/xen/xenfs/super.c
@@ -0,0 +1,65 @@
+/*
+ *  xenfs.c - a filesystem for passing info between the a domain and
+ *  the hypervisor.
+ *
+ * 2008-10-07  Alex Zeffertt    Replaced /proc/xen/xenbus with xenfs filesystem
+ *                              and /proc/xen compatibility mount point.
+ *                              Turned xenfs into a loadable module.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "xenfs.h"
+
+#include <asm/xen/hypervisor.h>
+
+#define XENFS_MAGIC    0xabba1974


Move to include/linux/magic.h, please.

OK.

+MODULE_DESCRIPTION("Xen filesystem");
+MODULE_LICENSE("GPL");


MODULE_AUTHOR()?

There's at least 4 candidates for that role, and Citrix is the officialcopyright holder, so I'm not sure what I'd put there.

./MAINTAINERS?


Covered by the blanket Xen entry (which could do with a refresh).

+static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+       static struct tree_descr xenfs_files[] = {
+               [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+               {""},
+       };
+
+       return simple_fill_super(sb, XENFS_MAGIC, xenfs_files);
+}
+

...

+struct xenbus_transaction_holder {
+       struct list_head list;
+       struct xenbus_transaction handle;
+};
+
+struct read_buffer {
+       struct list_head list;
+       unsigned int cons;
+       unsigned int len;
+       char msg[];
+};

+struct xenbus_file_priv {
+       /* In-progress transaction. */
+       struct list_head transactions;
+
+       /* Active watches. */
+       struct list_head watches;
+
+       /* Partial request. */
+       unsigned int len;
+       union {
+               struct xsd_sockmsg msg;
+               char buffer[PAGE_SIZE];
+       } u;
+
+       /* Response queue. */
+       struct list_head read_buffers;
+       wait_queue_head_t read_waitq;
+
+       struct mutex reply_mutex;
+};


It would be useful to document the locking for the list_head's (at least).


Hm, and add some, by the looks of it.

+
+static ssize_t xenbus_file_read(struct file *filp,
+                              char __user *ubuf,
+                              size_t len, loff_t *ppos)
+{
+       struct xenbus_file_priv *u = filp->private_data;
+       struct read_buffer *rb;
+       int i, ret;
+
+       mutex_lock(&u->reply_mutex);
+       while (list_empty(&u->read_buffers)) {
+               mutex_unlock(&u->reply_mutex);
+               ret = wait_event_interruptible(u->read_waitq,
+                                              !list_empty(&u->read_buffers));
+               if (ret)
+                       return ret;
+               mutex_lock(&u->reply_mutex);
+       }
+
+       rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+       for (i = 0; i < len;) {
+               ret = put_user(rb->msg[rb->cons], ubuf + i);


So mmap_sem nests inside ->reply_mutex.  Has this all been carefully
tested with lockdep?

I run with lockdep enabled habitually, and I've seen no squeaks fromthis code.

The mmap_sem nesting would be a problem if this even implementedmappable files, but it is OK now?

+               if (ret) {
+                       mutex_unlock(&u->reply_mutex);
+                       return ret;
+               }
+               i++;
+               rb->cons++;
+               if (rb->cons == rb->len) {
+                       list_del(&rb->list);
+                       kfree(rb);
+                       if (list_empty(&u->read_buffers))
+                               break;
+                       rb = list_entry(u->read_buffers.next,
+                                       struct read_buffer, list);
+               }
+       }


This code will misbehave if it ever receives a read_buffer with len==0.

+       mutex_unlock(&u->reply_mutex);
+
+       return i;
+}
+
+static int queue_reply(struct list_head *list, char *data, unsigned int len)
+{
+       struct read_buffer *rb;
+
+       if (len == 0)
+               return 0;


That should fix it.

+       rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+       if (rb == NULL)
+               return -ENOMEM;
+
+       rb->cons = 0;
+       rb->len = len;
+
+       memcpy(rb->msg, data, len);
+
+       list_add_tail(&rb->list, list);


So the caller of this function must hold ->reply_mutex.

You thought that was secret but I found it out!

No, it needn't. The list its adding to is a local staging list, whichis spliced to the real reply list once everything has been successfullyallocated (otherwise it just gets thrown out). In practice all thecallers to hold the ->reply_mutex.

+       return 0;
+}
+
+/* Free all the read_buffer s on a list.
+ * Caller must have sole reference to list.
+ */
+static void queue_cleanup(struct list_head  *list)
+{
+       struct read_buffer *rb;
+
+       while (!list_empty(list)) {
+               rb = list_entry(list->next, struct read_buffer, list);
+               list_del(list->next);
+               kfree(rb);
+       }
+}
+
+struct watch_adapter {
+       struct list_head list;


locking is?

Missing.

+       struct xenbus_watch watch;
+       struct xenbus_file_priv *dev_data;
+       char *token;
+};
+

...

+static LIST_HEAD(watch_list);
+
+static ssize_t xenbus_file_write(struct file *filp,
+                               const char __user *ubuf,
+                               size_t len, loff_t *ppos)
+{
+       struct xenbus_file_priv *u = filp->private_data;
+       struct xenbus_transaction_holder *trans = NULL;
+       uint32_t msg_type;
+       void *reply;
+       char *path, *token;
+       int err, rc = len;
+       int ret;
+       LIST_HEAD(staging_q);
+
+       if ((len + u->len) > sizeof(u->u.buffer)) {
+               rc = -EINVAL;
+               goto out;
+       }


Now what's this doing?

One would expect a large write to get chunked up into smaller writes by
the kernel.

This code is poorly documented.

Each logical write is a xenbus message packet. If usermode passes apartial write then it gets accumulated until we have a full message.

+       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       u->len += len;
+       if ((u->len < sizeof(u->u.msg)) ||
+           (u->len < (sizeof(u->u.msg) + u->u.msg.len)))
+               return rc;
+
+       msg_type = u->u.msg.type;
+
+       switch (msg_type) {
+       case XS_TRANSACTION_START:
+       case XS_TRANSACTION_END:
+       case XS_DIRECTORY:
+       case XS_READ:
+       case XS_GET_PERMS:
+       case XS_RELEASE:
+       case XS_GET_DOMAIN_PATH:
+       case XS_WRITE:
+       case XS_MKDIR:
+       case XS_RM:
+       case XS_SET_PERMS:
+               if (msg_type == XS_TRANSACTION_START) {
+                       trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+                       if (!trans) {
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+               }
+
+               reply = xenbus_dev_request_and_reply(&u->u.msg);
+               if (IS_ERR(reply)) {
+                       kfree(trans);
+                       rc = PTR_ERR(reply);
+                       goto out;
+               }
+
+               if (msg_type == XS_TRANSACTION_START) {
+                       trans->handle.id = simple_strtoul(reply, NULL, 0);
+                       list_add(&trans->list, &u->transactions);
+               } else if (msg_type == XS_TRANSACTION_END) {
+                       list_for_each_entry(trans, &u->transactions, list)
+                               if (trans->handle.id == u->u.msg.tx_id)
+                                       break;
+                       BUG_ON(&trans->list == &u->transactions);
+                       list_del(&trans->list);
+                       kfree(trans);
+               }
+               mutex_lock(&u->reply_mutex);
+               ret = queue_reply(&staging_q,
+                                 (char *)&u->u.msg, sizeof(u->u.msg));
+               if (!ret)
+                       ret = queue_reply(&staging_q,
+                                         (char *)reply, u->u.msg.len);
+               if (!ret) {
+                       list_splice_tail(&staging_q, &u->read_buffers);
+                       wake_up(&u->read_waitq);
+               } else {
+                       queue_cleanup(&staging_q);
+                       rc = ret;
+               }
+               mutex_unlock(&u->reply_mutex);
+               kfree(reply);
+               break;
+
+       case XS_WATCH:
+       case XS_UNWATCH: {
+               struct watch_adapter *watch, *tmp_watch;
+               static const char XS_RESP[] = "OK";
+               struct xsd_sockmsg hdr;
+
+               path = u->u.buffer + sizeof(u->u.msg);
+               token = memchr(path, 0, u->u.msg.len);
+               if (token == NULL) {
+                       rc = -EILSEQ;
+                       goto out;
+               }
+               token++;
+
+               if (msg_type == XS_WATCH) {
+                       watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+                       if (watch == NULL) {
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+                       watch->watch.node =
+                               kmalloc(strlen(path)+1, GFP_KERNEL);
+                       if (watch->watch.node == NULL) {
+                               kfree(watch);
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+                       strcpy((char *)watch->watch.node, path);
+                       watch->watch.callback = watch_fired;
+                       watch->token =
+                               kmalloc(strlen(token)+1, GFP_KERNEL);
+                       if (watch->token == NULL) {
+                               kfree(watch->watch.node);
+                               kfree(watch);
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+                       strcpy(watch->token, token);
+                       watch->dev_data = u;
+
+                       err = register_xenbus_watch(&watch->watch);
+                       if (err) {
+                               free_watch_adapter(watch);
+                               rc = err;
+                               goto out;
+                       }
+
+                       list_add(&watch->list, &u->watches);
+               } else {
+                       list_for_each_entry_safe(watch, tmp_watch,
+                                                &u->watches, list) {
+                               if (!strcmp(watch->token, token) &&
+                                   !strcmp(watch->watch.node, path)) {
+                                       unregister_xenbus_watch(&watch->watch);
+                                       list_del(&watch->list);
+                                       free_watch_adapter(watch);
+                                       break;
+                               }
+                       }
+               }
+
+               hdr.type = msg_type;
+               hdr.len = sizeof(XS_RESP);
+               mutex_lock(&u->reply_mutex);
+               ret = queue_reply(&staging_q, (char *)&hdr, sizeof(hdr));
+               if (!ret)
+                       ret = queue_reply(&staging_q, (char *)XS_RESP, hdr.len);
+               if (!ret) {
+                       list_splice_tail(&staging_q, &u->read_buffers);
+                       wake_up(&u->read_waitq);
+               } else {
+                       queue_cleanup(&staging_q);
+                       rc = ret;
+               }
+               mutex_unlock(&u->reply_mutex);
+               break;
+       }
+
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+ out:
+       u->len = 0;
+       return rc;
+}


This function implements a new kernel ABI.  How are reviewers to review
the design of this proposed ABI?

It's closer to a network protocol. This is pretty much a raw interfaceto the xenbus protocol allowing guests to access the host xenbusnamespace. The read side just returns the naked xenbus protocol data.The write side needs to peek into it to maintain some local state, likewhat transactions are currently going. But in general the kerneldoesn't inspect what's being written particularly closely.

...

+static unsigned int xenbus_file_poll(struct file *file, poll_table *wait)
+{
+       struct xenbus_file_priv *u = file->private_data;
+
+       poll_wait(file, &u->read_waitq, wait);
+       if (!list_empty(&u->read_buffers))
+               return POLLIN | POLLRDNORM;


I wonder if we needed the lock or some open-coded barrier to safely run
list_empty().


Not sure.

Thanks for looking over this. I'll post an updated patch after a quickround of testing.


   J

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] Re: [PATCH] xen: add xenfs to allow usermode <-> Xen interac