📄 lguest.c
字号:
*
 * We lay them out of the way, just below the initrd (which is why we need to
 * know its size). */
static unsigned long setup_pagetables(unsigned long mem,
				      unsigned long initrd_size)
{
	/* The page size drives every calculation below, so fetch it once. */
	int page_size = getpagesize();
	unsigned int ptes_per_page = page_size/sizeof(void *);
	unsigned int mapped_pages = mem/page_size;
	unsigned int linear_pages, page;
	unsigned long *pgdir, *linear;

	/* Round up: one PTE page covers ptes_per_page pages of memory. */
	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;

	/* The toplevel page directory is the page just below the initrd... */
	pgdir = from_guest_phys(mem) - initrd_size - page_size;

	/* ...and the PTE pages sit immediately below the page directory. */
	linear = (void *)pgdir - linear_pages*page_size;

	/* Identity mapping: entry N holds the address of page N, or'ed with
	 * PAGE_PRESENT (the flags Present, Writable and Executable). */
	for (page = 0; page < mapped_pages; page++)
		linear[page] = ((page * page_size) | PAGE_PRESENT);

	/* Point each toplevel entry at the PTE page holding its first PTE. */
	for (page = 0; page < mapped_pages; page += ptes_per_page) {
		unsigned int pde = page/ptes_per_page;

		pgdir[pde] = ((to_guest_phys(linear) + page*sizeof(void *))
			      | PAGE_PRESENT);
	}

	verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
		mapped_pages, linear_pages, to_guest_phys(linear));

	/* The caller gets the guest-physical address of the toplevel: the
	 * kernel needs to know where it is. */
	return to_guest_phys(pgdir);
}
/*:*/

/* Join every string of the NULL-terminated array "args" into "dst", each one
 * followed by a single space, and NUL-terminate the result.  With an empty
 * array "dst" becomes the empty string.  The caller must supply a "dst" large
 * enough for the result: no bound is checked here. */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	unsigned int i;

	for (i = 0; args[i]; i++) {
		size_t n = strlen(args[i]);

		memcpy(out, args[i], n);
		out[n] = ' ';
		out += n + 1;
	}
	/* Terminate (this also covers the no-arguments case). */
	*out = '\0';
}

/*L:185 This is where we actually tell the kernel to initialize the Guest.
We * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow, the * top level pagetable and the entry point for the Guest. */static int tell_kernel(unsigned long pgdir, unsigned long start){ unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, guest_limit / getpagesize(), pgdir, start }; int fd; verbose("Guest: %p - %p (%#lx)\n", guest_base, guest_base + guest_limit, guest_limit); fd = open_or_die("/dev/lguest", O_RDWR); if (write(fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); /* We return the /dev/lguest file descriptor to control this Guest */ return fd;}/*:*/static void add_device_fd(int fd){ FD_SET(fd, &devices.infds); if (fd > devices.max_infd) devices.max_infd = fd;}/*L:200 * The Waker. * * With console, block and network devices, we can have lots of input which we * need to process. We could try to tell the kernel what file descriptors to * watch, but handing a file descriptor mask through to the kernel is fairly * icky. * * Instead, we fork off a process which watches the file descriptors and writes * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host * stop running the Guest. This causes the Launcher to return from the * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset * the LHREQ_BREAK and wake us up again. * * This, of course, is merely a different *kind* of icky. */static void wake_parent(int pipefd, int lguest_fd){ /* Add the pipe from the Launcher to the fdset in the device_list, so * we watch it, too. */ add_device_fd(pipefd); for (;;) { fd_set rfds = devices.infds; unsigned long args[] = { LHREQ_BREAK, 1 }; /* Wait until input is ready from one of the devices. */ select(devices.max_infd+1, &rfds, NULL, NULL, NULL); /* Is it a message from the Launcher? */ if (FD_ISSET(pipefd, &rfds)) { int fd; /* If read() returns 0, it means the Launcher has * exited. We silently follow. 
*/ if (read(pipefd, &fd, sizeof(fd)) == 0) exit(0); /* Otherwise it's telling us to change what file * descriptors we're to listen to. Positive means * listen to a new one, negative means stop * listening. */ if (fd >= 0) FD_SET(fd, &devices.infds); else FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ write(lguest_fd, args, sizeof(args)); }}/* This routine just sets up a pipe to the Waker process. */static int setup_waker(int lguest_fd){ int pipefd[2], child; /* We create a pipe to talk to the Waker, and also so it knows when the * Launcher dies (and closes pipe). */ pipe(pipefd); child = fork(); if (child == -1) err(1, "forking"); if (child == 0) { /* We are the Waker: close the "writing" end of our copy of the * pipe and start waiting for input. */ close(pipefd[1]); wake_parent(pipefd[0], lguest_fd); } /* Close the reading end of our copy of the pipe. */ close(pipefd[0]); /* Here is the fd used to talk to the waker. */ return pipefd[1];}/* * Device Handling. * * When the Guest gives us a buffer, it sends an array of addresses and sizes. * We need to make sure it's not trying to reach into the Launcher itself, so * we have a convenient routine which checks it and exits with an error message * if something funny is going on: */static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line){ /* We have to separately check addr and addr+size, because size could * be huge and addr + size might wrap around. */ if (addr >= guest_limit || addr + size >= guest_limit) errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); /* We return a pointer for the caller's convenience, now we know it's * safe to use. */ return from_guest_phys(addr);}/* A macro which transparently hands the line number to the real function. */#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)/* Each buffer in the virtqueues is actually a chain of descriptors. 
This * function returns the next descriptor in the chain, or vq->vring.num if we're * at the end. */static unsigned next_desc(struct virtqueue *vq, unsigned int i){ unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) return vq->vring.num; /* Check they're not leading us off end of descriptors. */ next = vq->vring.desc[i].next; /* Make sure compiler knows to grab that: we don't want it changing! */ wmb(); if (next >= vq->vring.num) errx(1, "Desc next is %u", next); return next;}/* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * * This function returns the descriptor number found, or vq->vring.num (which * is never a valid descriptor number) if none was found. */static unsigned get_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num){ unsigned int i, head; /* Check it isn't doing very strange things with descriptor numbers. */ if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) errx(1, "Guest moved used index from %u to %u", vq->last_avail_idx, vq->vring.avail->idx); /* If there's nothing new since last we looked, return invalid. */ if (vq->vring.avail->idx == vq->last_avail_idx) return vq->vring.num; /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; /* If their number is silly, that's a fatal mistake. */ if (head >= vq->vring.num) errx(1, "Guest says index %u is available", head); /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; i = head; do { /* Grab the first descriptor, and check it's OK. 
*/ iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; iov[*out_num + *in_num].iov_base = check_pointer(vq->vring.desc[i].addr, vq->vring.desc[i].len); /* If this is an input descriptor, increment that count. */ if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { /* If it's an output descriptor, they're all supposed * to come before any input descriptors. */ if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; } /* If we've got too many, that implies a descriptor loop. */ if (*out_num + *in_num > vq->vring.num) errx(1, "Looped descriptor"); } while ((i = next_desc(vq, i)) != vq->vring.num); return head;}/* After we've used one of their buffers, we tell them about it. We'll then * want to send them an interrupt, using trigger_irq(). */static void add_used(struct virtqueue *vq, unsigned int head, int len){ struct vring_used_elem *used; /* The virtqueue contains a ring of used buffers. Get a pointer to the * next entry in that used ring. */ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++;}/* This actually sends the interrupt for this virtqueue */static void trigger_irq(int fd, struct virtqueue *vq){ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; /* If they don't want an interrupt, don't send one. */ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) return; /* Send the Guest an interrupt tell them we used something up. */ if (write(fd, buf, sizeof(buf)) != 0) err(1, "Triggering irq %i", vq->config.irq);}/* And here's the combo meal deal. Supersize me! */static void add_used_and_trigger(int fd, struct virtqueue *vq, unsigned int head, int len){ add_used(vq, head, len); trigger_irq(fd, vq);}/* * The Console * * Here is the input terminal setting we save, and the routine to restore them * on exit so the user gets their terminal back. 
*/static struct termios orig_term;static void restore_term(void){ tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);}/* We associate some data with the console for our exit hack. */struct console_abort{ /* How many times have they hit ^C? */ int count; /* When did they start? */ struct timeval start;};/* This is the routine which handles console input (ie. stdin). */static bool handle_console_input(int fd, struct device *dev){ int len; unsigned int head, in_num, out_num; struct iovec iov[dev->vq->vring.num]; struct console_abort *abort = dev->priv; /* First we need a console buffer from the Guests's input virtqueue. */ head = get_vq_desc(dev->vq, iov, &out_num, &in_num); /* If they're not ready for input, stop listening to this file * descriptor. We'll start again once they add an input buffer. */ if (head == dev->vq->vring.num) return false; if (out_num) errx(1, "Output buffers in console in queue?"); /* This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. */ len = readv(dev->fd, iov, in_num); if (len <= 0) { /* This implies that the console is closed, is /dev/null, or * something went terribly wrong. */ warnx("Failed to get console input, ignoring console."); /* Put the input terminal back. */ restore_term(); /* Remove callback from input vq, so it doesn't restart us. */ dev->vq->handle_output = NULL; /* Stop listening to this fd: don't call us again. */ return false; } /* Tell the Guest about the new input. */ add_used_and_trigger(fd, dev->vq, head, len); /* Three ^C within one second? Exit. * * This is such a hack, but works surprisingly well. Each ^C has to be * in a buffer by itself, so they can't be too fast. But we check that * we get three within about a second, so they can't be too slow. 
*/ if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { if (!abort->count++) gettimeofday(&abort->start, NULL); else if (abort->count == 3) { struct timeval now; gettimeofday(&now, NULL); if (now.tv_sec <= abort->start.tv_sec+1) { unsigned long args[] = { LHREQ_BREAK, 0 }; /* Close the fd so Waker will know it has to * exit. */ close(waker_fd); /* Just in case waker is blocked in BREAK, send * unbreak now. */ write(fd, args, sizeof(args)); exit(2); } abort->count = 0; } } else /* Any other key resets the abort counter. */ abort->count = 0; /* Everything went OK! */ return true;}/* Handling output for console is simple: we just get all the output buffers * and write them to stdout. */static void handle_console_output(int fd, struct virtqueue *vq){ unsigned int head, out, in; int len; struct iovec iov[vq->vring.num]; /* Keep getting output buffers from the Guest until we run out. */ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); len = writev(STDOUT_FILENO, iov, out); add_used_and_trigger(fd, vq, head, len); }}/* * The Network * * Handling output for network is also simple: we get all the output buffers * and write them (ignoring the first element) to this device's file descriptor * (stdout). */static void handle_net_output(int fd, struct virtqueue *vq){ unsigned int head, out, in; int len; struct iovec iov[vq->vring.num]; /* Keep getting output buffers from the Guest until we run out. */ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); /* Check header, but otherwise ignore it (we told the Guest we * supported no features, so it shouldn't have anything * interesting). */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -