📄 lguest.c
字号:
/* Our block (disk) device should be really simple: the Guest asks for a block * number and we read or write that position in the file. Unfortunately, that * was amazingly slow: the Guest waits until the read is finished before * running anything else, even if it could have been doing useful work. * * We could use async I/O, except it's reputed to suck so hard that characters * actually go missing from your code when you try to use it. * * So we farm the I/O out to thread, and communicate with it via a pipe. *//* This hangs off device->priv. */struct vblk_info{ /* The size of the file. */ off64_t len; /* The file descriptor for the file. */ int fd; /* IO thread listens on this file descriptor [0]. */ int workpipe[2]; /* IO thread writes to this file descriptor to mark it done, then * Launcher triggers interrupt to Guest. */ int done_fd;};/*:*//*L:210 * The Disk * * Remember that the block device is handled by a separate I/O thread. We head * straight into the core of that thread here: */static bool service_io(struct device *dev){ struct vblk_info *vblk = dev->priv; unsigned int head, out_num, in_num, wlen; int ret; struct virtio_blk_inhdr *in; struct virtio_blk_outhdr *out; struct iovec iov[dev->vq->vring.num]; off64_t off; /* See if there's a request waiting. If not, nothing to do. */ head = get_vq_desc(dev->vq, iov, &out_num, &in_num); if (head == dev->vq->vring.num) return false; /* Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one * input buffer (to hold the result). */ if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); out = convert(&iov[0], struct virtio_blk_outhdr); in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); off = out->sector * 512; /* The block device implements "barriers", where the Guest indicates * that it wants all previous writes to occur before this write. We * don't have a way of asking our kernel to do a barrier, so we just * synchronize all the data in the file. Pretty poor, no? */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); /* In general the virtio block driver is allowed to try SCSI commands. * It'd be nice if we supported eject, for example, but we don't. */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); in->status = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { /* Write */ /* Move to the right location in the block file. This can fail * if they try to write past end. */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i\n", out->sector, ret); /* Grr... Now we know how long the descriptor they sent was, we * make sure they didn't try to write over the end of the block * file (possibly extending it). */ if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); /* Die, bad Guest, die. */ errx(1, "Write past end %llu+%u", off, ret); } wlen = sizeof(*in); in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { /* Read */ /* Move to the right location in the block file. This can fail * if they try to read past end. */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = readv(vblk->fd, iov+1, in_num-1); verbose("READ from sector %llu: %i\n", out->sector, ret); if (ret >= 0) { wlen = sizeof(*in) + ret; in->status = VIRTIO_BLK_S_OK; } else { wlen = sizeof(*in); in->status = VIRTIO_BLK_S_IOERR; } } /* We can't trigger an IRQ, because we're not the Launcher. It does * that when we tell it we're done. */ add_used(dev->vq, head, wlen); return true;}/* This is the thread which actually services the I/O. */static int io_thread(void *_dev){ struct device *dev = _dev; struct vblk_info *vblk = dev->priv; char c; /* Close other side of workpipe so we get 0 read when main dies. */ close(vblk->workpipe[1]); /* Close the other side of the done_fd pipe. */ close(dev->fd); /* When this read fails, it means Launcher died, so we follow. */ while (read(vblk->workpipe[0], &c, 1) == 1) { /* We acknowledge each request immediately to reduce latency, * rather than waiting until we've done them all. I haven't * measured to see if it makes any difference. */ while (service_io(dev)) write(vblk->done_fd, &c, 1); } return 0;}/* Now we've seen the I/O thread, we return to the Launcher to see what happens * when the thread tells us it's completed some I/O. */static bool handle_io_finish(int fd, struct device *dev){ char c; /* If the I/O thread died, presumably it printed the error, so we * simply exit. */ if (read(dev->fd, &c, 1) != 1) exit(1); /* It did some work, so trigger the irq. */ trigger_irq(fd, dev->vq); return true;}/* When the Guest submits some I/O, we just need to wake the I/O thread. */static void handle_virtblk_output(int fd, struct virtqueue *vq){ struct vblk_info *vblk = vq->dev->priv; char c = 0; /* Wake up I/O thread and tell it to go to work! */ if (write(vblk->workpipe[1], &c, 1) != 1) /* Presumably it indicated why it died. */ exit(1);}/*L:198 This actually sets up a virtual block device. */static void setup_block_file(const char *filename){ int p[2]; struct device *dev; struct vblk_info *vblk; void *stack; u64 cap; unsigned int val; /* This is the pipe the I/O thread will use to tell us I/O is done. */ pipe(p); /* The device responds to return from I/O thread. */ dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); /* The device has one virtqueue, where the Guest places requests. */ add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); /* Allocate the room for our own bookkeeping */ vblk = dev->priv = malloc(sizeof(*vblk)); /* First we open the file and store the length. */ vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); /* Tell Guest how many sectors this device has. */ cap = cpu_to_le64(vblk->len / 512); add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); /* Tell Guest not to put in too many descriptors at once: two are used * for the in and out elements. */ val = cpu_to_le32(VIRTQUEUE_NUM - 2); add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); /* The I/O thread writes to this end of the pipe when done. */ vblk->done_fd = p[1]; /* This is the second pipe, which is how we tell the I/O thread about * more work. */ pipe(vblk->workpipe); /* Create stack for thread and run it */ stack = malloc(32768); if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ close(vblk->done_fd); close(vblk->workpipe[0]); verbose("device %u: virtblock %llu sectors\n", devices.device_num, cap);}/* That's the end of device setup. *//*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */static void __attribute__((noreturn)) run_guest(int lguest_fd){ for (;;) { unsigned long args[] = { LHREQ_BREAK, 0 }; unsigned long notify_addr; int readval; /* We read from the /dev/lguest device to run the Guest. */ readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr)); /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx\n", notify_addr); handle_output(lguest_fd, notify_addr); continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; read(lguest_fd, reason, sizeof(reason)-1); errx(1, "%s", reason); /* EAGAIN means the Waker wanted us to look at some input. * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); /* Service input, then unset the BREAK to release the Waker. */ handle_input(lguest_fd); if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Resetting break"); }}/* * This is the end of the Launcher. The good news: we are over halfway * through! The bad news: the most fiendish part of the code still lies ahead * of us. * * Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". :*/static struct option opts[] = { { "verbose", 0, NULL, 'v' }, { "tunnet", 1, NULL, 't' }, { "block", 1, NULL, 'b' }, { "initrd", 1, NULL, 'i' }, { NULL },};static void usage(void){ errx(1, "Usage: lguest [--verbose] " "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" "|--block=<filename>|--initrd=<filename>]...\n" "<mem-in-mb> vmlinux [args...]");}/*L:105 The main routine is where the real work begins: */int main(int argc, char *argv[]){ /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ unsigned long mem = 0, pgdir, start, initrd_size = 0; /* Two temporaries and the /dev/lguest file descriptor. */ int i, c, lguest_fd; /* The boot information for the Guest. */ struct boot_params *boot; /* If they specify an initrd file to load. */ const char *initrd_name = NULL; /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the * list. We also keep a pointer to the last device, for easy appending * to the list. Finally, we keep the next interrupt number to hand out * (1: remember that 0 is used by the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; devices.lastdev = &devices.dev; devices.next_irq = 1; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount * of memory now. */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { mem = atoi(argv[i]) * 1024 * 1024; /* We start by mapping anonymous pages over all of * guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it * tries to access it. */ guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; guest_max = mem + DEVICE_PAGES*getpagesize(); devices.descpage = get_pages(1); break; } } /* The options are fairly straight-forward */ while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { switch (c) { case 'v': verbose = true; break; case 't': setup_tun_net(optarg); break; case 'b': setup_block_file(optarg); break; case 'i': initrd_name = optarg; break; default: warnx("Unknown argument %s", argv[optind]); usage(); } } /* After the other arguments we expect memory and kernel image name, * followed by command line arguments for the kernel. */ if (optind + 2 > argc) usage(); verbose("Guest base is at %p\n", guest_base); /* We always have a console device */ setup_console(); /* Now we load the kernel */ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); /* Boot information is stashed at physical address 0 */ boot = from_guest_phys(0); /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); /* These are the location in the Linux boot header where the * start and size of the initrd are expected to be found. */ boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ boot->hdr.type_of_loader = 0xFF; } /* Set up the initial linear pagetables, starting below the initrd. */ pgdir = setup_pagetables(mem, initrd_size); /* The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); /* The boot header contains a command line pointer: we put the command * line after the boot header. */ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); /* We use a simple helper to copy the arguments separated by spaces. */ concat((char *)(boot + 1), argv+optind+2); /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ boot->hdr.hardware_subarch = 1; /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ lguest_fd = tell_kernel(pgdir, start); /* We fork off a child process, which wakes the Launcher whenever one * of the input file descriptors needs attention. Otherwise we would * run the Guest until it tries to output something. */ waker_fd = setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ run_guest(lguest_fd);}/*:*//*M:999 * Mastery is done: you now know everything I do. * * But surely you have seen code, features and bugs in your wanderings which * you now yearn to attack? That is the real game, and I look forward to you * patching and forking lguest into the Your-Name-Here-visor. * * Farewell, and good coding! * Rusty Russell. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -