📄 lguest.c
字号:
/*P:100 This is the Launcher code, a simple program which lays out the * "physical" memory for the new Guest by mapping the kernel image and the * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.:*/#define _LARGEFILE64_SOURCE#define _GNU_SOURCE#include <stdio.h>#include <string.h>#include <unistd.h>#include <err.h>#include <stdint.h>#include <stdlib.h>#include <elf.h>#include <sys/mman.h>#include <sys/param.h>#include <sys/types.h>#include <sys/stat.h>#include <sys/wait.h>#include <fcntl.h>#include <stdbool.h>#include <errno.h>#include <ctype.h>#include <sys/socket.h>#include <sys/ioctl.h>#include <sys/time.h>#include <time.h>#include <netinet/in.h>#include <net/if.h>#include <linux/sockios.h>#include <linux/if_tun.h>#include <sys/uio.h>#include <termios.h>#include <getopt.h>#include <zlib.h>#include <assert.h>#include <sched.h>#include "linux/lguest_launcher.h"#include "linux/virtio_config.h"#include "linux/virtio_net.h"#include "linux/virtio_blk.h"#include "linux/virtio_console.h"#include "linux/virtio_ring.h"#include "asm-x86/bootparam.h"/*L:110 We can ignore the 38 include files we need for this program, but I do * want to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can * use %llu in printf for any u64. */typedef unsigned long long u64;typedef uint32_t u32;typedef uint16_t u16;typedef uint8_t u8;/*:*/#define PAGE_PRESENT 0x7 /* Present, RW, Execute */#define NET_PEERNUM 1#define BRIDGE_PFX "bridge:"#ifndef SIOCBRADDIF#define SIOCBRADDIF 0x89a2 /* add interface to bridge */#endif/* We can have up to 256 pages for devices. */#define DEVICE_PAGES 256/* This will occupy 2 pages: it must be a power of 2. */#define VIRTQUEUE_NUM 128/*L:120 verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */static bool verbose;#define verbose(args...) \ do { if (verbose) printf(args); } while(0)/*:*//* The pipe to send commands to the waker process */static int waker_fd;/* The pointer to the start of guest memory. */static void *guest_base;/* The maximum guest physical address allowed, and maximum possible. */static unsigned long guest_limit, guest_max;/* This is our list of devices. */struct device_list{ /* Summary information about the devices in our list: ready to pass to * select() to ask which need servicing.*/ fd_set infds; int max_infd; /* Counter to assign interrupt numbers. */ unsigned int next_irq; /* Counter to print out convenient device numbers. */ unsigned int device_num; /* The descriptor page for the devices. */ u8 *descpage; /* The tail of the last descriptor. */ unsigned int desc_used; /* A single linked list of devices. */ struct device *dev; /* ... And an end pointer so we can easily append new devices */ struct device **lastdev;};/* The list of Guest devices, based on command line arguments. */static struct device_list devices;/* The device structure describes a single device. */struct device{ /* The linked-list pointer. */ struct device *next; /* The this device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; /* The name of this device, for --verbose. */ const char *name; /* If handle_input is set, it wants to be called when this file * descriptor is ready. */ int fd; bool (*handle_input)(int fd, struct device *me); /* Any queues attached to this device */ struct virtqueue *vq; /* Device-specific data. */ void *priv;};/* The virtqueue structure describes a queue attached to a device. */struct virtqueue{ struct virtqueue *next; /* Which device owns me. */ struct device *dev; /* The configuration for this queue. */ struct lguest_vqconfig config; /* The actual ring of buffers. */ struct vring vring; /* Last available index we saw. */ u16 last_avail_idx; /* The routine to call when the Guest pings us. */ void (*handle_output)(int fd, struct virtqueue *me);};/* Since guest is UP and we don't run at the same time, we don't need barriers. * But I include them in the code in case others copy it. */#define wmb()/* Convert an iovec element to the given type. * * This is a fairly ugly trick: we need to know the size of the type and * alignment requirement to check the pointer is kosher. It's also nice to * have the name of the type in case we report failure. * * Typing those three things all the time is cumbersome and error prone, so we * have a macro which sets them all up and passes to the real function. */#define convert(iov, type) \ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))static void *_convert(struct iovec *iov, size_t size, size_t align, const char *name){ if (iov->iov_len != size) errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); if ((unsigned long)iov->iov_base % align != 0) errx(1, "Bad alignment %p for %s", iov->iov_base, name); return iov->iov_base;}/* The virtio configuration space is defined to be little-endian. x86 is * little-endian too, but it's nice to be explicit so we have these helpers. */#define cpu_to_le16(v16) (v16)#define cpu_to_le32(v32) (v32)#define cpu_to_le64(v64) (v64)#define le16_to_cpu(v16) (v16)#define le32_to_cpu(v32) (v32)#define le64_to_cpu(v32) (v64)/*L:100 The Launcher code itself takes us out into userspace, that scary place * where pointers run wild and free! Unfortunately, like most userspace * programs, it's quite boring (which is why everyone likes to hack on the * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it * will get you through this section. Or, maybe not. * * The Launcher sets up a big chunk of memory to be the Guest's "physical" * memory and stores it in "guest_base". In other words, Guest physical == * Launcher virtual with an offset. * * This can be tough to get your head around, but usually it just means that we * use these trivial conversion functions when the Guest gives us it's * "physical" addresses: */static void *from_guest_phys(unsigned long addr){ return guest_base + addr;}static unsigned long to_guest_phys(const void *addr){ return (addr - guest_base);}/*L:130 * Loading the Kernel. * * We start with couple of simple helper routines. open_or_die() avoids * error-checking code cluttering the callers: */static int open_or_die(const char *name, int flags){ int fd = open(name, flags); if (fd < 0) err(1, "Failed to open %s", name); return fd;}/* map_zeroed_pages() takes a number of pages. */static void *map_zeroed_pages(unsigned int num){ int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; /* We use a private mapping (ie. if we write to the page, it will be * copied). */ addr = mmap(NULL, getpagesize() * num, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); return addr;}/* Get some more pages for a device. */static void *get_pages(unsigned int num){ void *addr = from_guest_phys(guest_limit); guest_limit += num * getpagesize(); if (guest_limit > guest_max) errx(1, "Not enough memory for devices"); return addr;}/* This routine is used to load the kernel or initrd. It tries mmap, but if * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), * it falls back to reading the memory in. */static void map_at(int fd, void *addr, unsigned long offset, unsigned long len){ ssize_t r; /* We map writable even though for some segments are marked read-only. * The kernel really wants to be writable: it patches its own * instructions. * * MAP_PRIVATE means that the page won't be copied until a write is * done to it. This allows us to share untouched memory between * Guests. */ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) return; /* pread does a seek and a read in one shot: saves a few lines. */ r = pread(fd, addr, len, offset); if (r != len) err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);}/* This routine takes an open vmlinux image, which is in ELF, and maps it into * the Guest memory. ELF = Embedded Linking Format, which is the format used * by all modern binaries on Linux including the kernel. * * The ELF headers give *two* addresses: a physical address, and a virtual * address. We use the physical address; the Guest will map itself to the * virtual address. * * We return the starting address. */static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr){ Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; /* Sanity checks on the main ELF header: an x86 executable with a * reasonable number of correctly-sized program headers. */ if (ehdr->e_type != ET_EXEC || ehdr->e_machine != EM_386 || ehdr->e_phentsize != sizeof(Elf32_Phdr) || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) errx(1, "Malformed elf header"); /* An ELF executable contains an ELF header and a number of "program" * headers which indicate which parts ("segments") of the program to * load where. */ /* We read in all the program headers at once: */ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) err(1, "Seeking to program headers"); if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); /* Try all the headers: there are usually only three. A read-only one, * a read-write one, and a "note" section which isn't loadable. */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) continue; verbose("Section %i: size %i addr %p\n", i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); /* We map this section of the file at its physical address. */ map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), phdr[i].p_offset, phdr[i].p_filesz); } /* The entry point is given in the ELF header. */ return ehdr->e_entry;}/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're * supposed to jump into it and it will unpack itself. We used to have to * perform some hairy magic because the unpacking code scared me. * * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read * the funky header so we know where in the file to load, and away we go! */static unsigned long load_bzimage(int fd){ struct boot_params boot; int r; /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); /* Go back to the start of the file and read the header. It should be * a Linux boot header (see Documentation/i386/boot.txt) */ lseek(fd, 0, SEEK_SET); read(fd, &boot, sizeof(boot)); /* Inside the setup_hdr, we expect the magic "HdrS" */ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) errx(1, "This doesn't look like a bzImage to me"); /* Skip over the extra sectors of the header. */ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); /* Now read everything into memory. in nice big chunks. */ while ((r = read(fd, p, 65536)) > 0) p += r; /* Finally, code32_start tells us where to enter the kernel. */ return boot.hdr.code32_start;}/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With a little * work, we can load those, too. */static unsigned long load_kernel(int fd){ Elf32_Ehdr hdr; /* Read in the first few bytes. */ if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) err(1, "Reading kernel"); /* If it's an ELF file, it starts with "\177ELF" */ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) return map_elf(fd, &hdr); /* Otherwise we assume it's a bzImage, and try to unpack it */ return load_bzimage(fd);}/* This is a trivial little helper to align pages. Andi Kleen hated it because * it calls getpagesize() twice: "it's dumb code." * * Kernel guys get really het up about optimization, even when it's not * necessary. I leave this code as a reaction against that. */static inline unsigned long page_align(unsigned long addr){ /* Add upwards and truncate downwards. */ return ((addr + getpagesize()-1) & ~(getpagesize()-1));}/*L:180 An "initial ram disk" is a disk image loaded into memory along with * the kernel which the kernel can use to boot from without needing any * drivers. Most distributions now use this as standard: the initrd contains * the code to load the appropriate driver modules for the current machine. * * Importantly, James Morris works for RedHat, and Fedora uses initrds for its * kernels. He sent me this (and tells me when I break it). */static unsigned long load_initrd(const char *name, unsigned long mem){ int ifd; struct stat st; unsigned long len; ifd = open_or_die(name, O_RDONLY); /* fstat() is needed to get the file size. */ if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); /* We map the initrd at the top of memory, but mmap wants it to be * page-aligned, so we round the size up for that. */ len = page_align(st.st_size); map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); /* Once a file is mapped, you can close the file descriptor. It's a * little odd, but quite useful. */ close(ifd); verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); /* We return the initrd size. */ return len;}/* Once we know how much memory we have, we can construct simple linear page * tables which set virtual == physical which will get the Guest far enough * into the boot to create its own.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -