📄 block-vhd.c
字号:
/*
* Copyright (c) 2008, XenSource Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of XenSource Inc. nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* A note on write transactions:
* Writes that require updating the BAT or bitmaps cannot be signaled
* as complete until all updates have reached disk. Transactions are
* used to ensure proper ordering in these cases. The two types of
* transactions are as follows:
* - Bitmap updates only: data writes that require updates to the same
* bitmap are grouped in a transaction. Only after all data writes
* in a transaction complete does the bitmap write commence. Only
* after the bitmap write finishes are the data writes signalled as
* complete.
* - BAT and bitmap updates: data writes are grouped in transactions
* as above, but a special extra write is included in the transaction,
* which zeros out the newly allocated bitmap on disk. When the data
* writes and the zero-bitmap write complete, the BAT and bitmap writes
* are started in parallel. The transaction is completed only after both
* the BAT and bitmap writes successfully return.
*/
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <uuid/uuid.h> /* For whatever reason, Linux packages this in */
/* e2fsprogs-devel. */
#include <string.h> /* for memset. */
#include <libaio.h>
#include <sys/mman.h>
#include "libvhd.h"
#include "tapdisk.h"
#include "tapdisk-driver.h"
#include "tapdisk-interface.h"
/* NOTE(review): SPB has external linkage; no use of it is visible in this
 * part of the file -- confirm it is still needed. */
unsigned int SPB;

/*
 * Build-time switches.
 * DEBUGGING selects the logging backend for DBG/ERR/TRACE below:
 *   1 -> DPRINTF, 2 -> tlog_write/tlog_error, anything else -> compiled out.
 * ASSERTING == 1 enables ASSERT(); otherwise ASSERT() expands to a no-op.
 */
#define DEBUGGING 2
#define ASSERTING 1
#define MICROSOFT_COMPAT
#define VHD_BATMAP_MAX_RETRIES 10

/* Log the request counters and pending BAT-write block of state s. */
#define __TRACE(s) \
	do { \
		DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \
		    PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \
		    "%lu, BBLK: 0x%04x\n", \
		    (s)->vhd.file, (s)->queued, (s)->completed, \
		    (s)->returned, \
		    VHD_REQS_DATA - (s)->vreq_free_count, \
		    (s)->bat.pbw_blk); \
	} while(0)

/*
 * Log a failed assertion, flush the log and crash the process.
 *
 * Wrapped in do { } while (0) so that "ASSERT(x);" is a single statement
 * and can be used safely as the body of an unbraced if/else.  The null
 * store goes through a volatile pointer so the compiler cannot discard it.
 */
#define __ASSERT(_p) \
	do { \
		if (!(_p)) { \
			DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \
				__FILE__, __LINE__, #_p); \
			DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \
			    __FILE__, __LINE__, #_p); \
			tlog_flush(); \
			*(volatile int *)0 = 0; \
		} \
	} while (0)

#if (DEBUGGING == 1)
#define DBG(level, _f, _a...) DPRINTF(_f, ##_a)
#define ERR(err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a)
#define TRACE(s) ((void)0)
#elif (DEBUGGING == 2)
#define DBG(level, _f, _a...) tlog_write(level, _f, ##_a)
#define ERR(err, _f, _a...) tlog_error(err, _f, ##_a)
#define TRACE(s) __TRACE(s)
#else
#define DBG(level, _f, _a...) ((void)0)
#define ERR(err, _f, _a...) ((void)0)
#define TRACE(s) ((void)0)
#endif

#if (ASSERTING == 1)
#define ASSERT(_p) __ASSERT(_p)
#else
#define ASSERT(_p) ((void)0)
#endif
/******VHD DEFINES******/
/* Number of slots in the bitmap cache arrays of struct vhd_state
 * (bitmap, bitmap_free, bitmap_list). */
#define VHD_CACHE_SIZE 32
/* Size of the data-request pool (vreq_free / vreq_list in struct vhd_state). */
#define VHD_REQS_DATA TAPDISK_DATA_REQUESTS
/* NOTE(review): presumably one metadata request per cached bitmap plus the
 * two requests embedded in struct vhd_bat_state -- confirm. */
#define VHD_REQS_META (VHD_CACHE_SIZE + 2)
#define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META)
/* Operation codes; presumably the values stored in vhd_request.op --
 * confirm against the I/O paths. */
#define VHD_OP_BAT_WRITE 0
#define VHD_OP_DATA_READ 1
#define VHD_OP_DATA_WRITE 2
#define VHD_OP_BITMAP_READ 3
#define VHD_OP_BITMAP_WRITE 4
#define VHD_OP_ZERO_BM_WRITE 5
/* NOTE(review): these look like result codes of a bitmap/BAT lookup; their
 * users are not in this part of the file. */
#define VHD_BM_BAT_LOCKED 0
#define VHD_BM_BAT_CLEAR 1
#define VHD_BM_BIT_CLEAR 2
#define VHD_BM_BIT_SET 3
#define VHD_BM_NOT_CACHED 4
#define VHD_BM_READ_PENDING 5
/* Open-mode bits kept in vhd_state.flags (RDONLY, NO_CACHE, QUIET and
 * PREALLOCATE are tested by the initialisation helpers). */
#define VHD_FLAG_OPEN_RDONLY 1
#define VHD_FLAG_OPEN_NO_CACHE 2
#define VHD_FLAG_OPEN_QUIET 4
#define VHD_FLAG_OPEN_STRICT 8
#define VHD_FLAG_OPEN_QUERY 16
#define VHD_FLAG_OPEN_PREALLOCATE 32
/* Presumably bits for vhd_bat_state.status -- confirm. */
#define VHD_FLAG_BAT_LOCKED 1
#define VHD_FLAG_BAT_WRITE_STARTED 2
/* Presumably bits for vhd_bitmap.status -- confirm. */
#define VHD_FLAG_BM_UPDATE_BAT 1
#define VHD_FLAG_BM_WRITE_PENDING 2
#define VHD_FLAG_BM_READ_PENDING 4
#define VHD_FLAG_BM_LOCKED 8
/* Presumably bits for vhd_request.flags -- confirm. */
#define VHD_FLAG_REQ_UPDATE_BAT 1
#define VHD_FLAG_REQ_UPDATE_BITMAP 2
#define VHD_FLAG_REQ_QUEUED 4
#define VHD_FLAG_REQ_FINISHED 8
/* Presumably bits for vhd_transaction.status -- confirm. */
#define VHD_FLAG_TX_LIVE 1
#define VHD_FLAG_TX_UPDATE_BAT 2
/* Every flag word is 8 bits wide, so no flag value may exceed 0x80. */
typedef uint8_t vhd_flag_t;
/* Forward declarations so the request structures can point at each other. */
struct vhd_state;
struct vhd_request;
/* Singly linked list of requests, chained through vhd_request.next. */
struct vhd_req_list {
struct vhd_request *head;
struct vhd_request *tail;
};
/*
 * Bookkeeping for one write transaction (see the note on write
 * transactions in the file header).
 * NOTE(review): closed/started/finished are plain ints; whether they are
 * counters or booleans is not visible here -- confirm.
 */
struct vhd_transaction {
int error;
int closed;
int started;
int finished;
vhd_flag_t status;
struct vhd_req_list requests;
};
/* One I/O request issued by this driver. */
struct vhd_request {
int error;
uint8_t op;
vhd_flag_t flags;
td_request_t treq;
struct tiocb tiocb;
struct vhd_state *state;
struct vhd_request *next; /* link used by struct vhd_req_list */
struct vhd_transaction *tx;
};
/*
 * In-memory BAT state for one image: the BAT itself, the optional batmap,
 * and the requests/buffer used to write BAT updates.
 */
struct vhd_bat_state {
vhd_bat_t bat;
vhd_batmap_t batmap;
vhd_flag_t status;
uint32_t pbw_blk; /* blk num of pending write */
uint64_t pbw_offset; /* file offset of same */
struct vhd_request req; /* for writing bat table */
struct vhd_request zero_req; /* for initializing bitmaps */
char *bat_buf; /* VHD_SECTOR_SIZE-aligned, VHD_SECTOR_SIZE bytes;
* allocated in vhd_initialize_bat */
};
/*
 * One slot of the bitmap cache.  map and shadow are separate 512-byte
 * aligned buffers allocated in vhd_initialize_bitmap_cache.
 */
struct vhd_bitmap {
u32 blk;
u64 seqno; /* lru sequence number */
vhd_flag_t status;
char *map; /* map should only be modified
* in finish_bitmap_write */
char *shadow; /* in-memory bitmap changes are
* made to shadow and copied to
* map only after having been
* flushed to disk */
struct vhd_transaction tx; /* transaction data structure
* encapsulating data, bitmap,
* and bat writes */
struct vhd_req_list queue; /* data writes waiting for next
* transaction */
struct vhd_req_list waiting; /* pending requests that cannot
* be serviced until this bitmap
* is read from disk */
struct vhd_request req;
};
/* Per-image driver state. */
struct vhd_state {
vhd_flag_t flags; /* VHD_FLAG_OPEN_* bits */
/* VHD stuff */
vhd_context_t vhd;
u32 spp; /* sectors per page */
u32 spb; /* sectors per block */
u64 next_db; /* pointer to the next
* (unallocated) datablock */
struct vhd_bat_state bat;
u64 bm_lru; /* lru sequence number */
u32 bm_secs; /* size of bitmap, in sectors */
/* Bitmap cache: bitmap_list owns the slots, bitmap_free holds the unused
* ones, and bm_free_count starts at VHD_CACHE_SIZE. */
struct vhd_bitmap *bitmap[VHD_CACHE_SIZE];
int bm_free_count;
struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE];
struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE];
/* Data-request pool; presumably managed like the bitmap free list --
* confirm against the allocation helpers. */
int vreq_free_count;
struct vhd_request *vreq_free[VHD_REQS_DATA];
struct vhd_request vreq_list[VHD_REQS_DATA];
td_driver_t *driver;
/* Request counters; queued/completed/returned are printed by TRACE(). */
uint64_t queued;
uint64_t completed;
uint64_t returned;
uint64_t reads;
uint64_t read_size;
uint64_t writes;
uint64_t write_size;
};
/* Flag helpers: test, set and clear bits in a flag word.  test_vhd_flag
 * yields the masked value (non-zero when any requested bit is set). */
#define test_vhd_flag(word, flag) ((word) & (flag))
#define set_vhd_flag(word, flag) ((word) |= (flag))
#define clear_vhd_flag(word, flag) ((word) &= ~(flag))
/* Element blk of the in-memory BAT array of state s. */
#define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)])
/* Forward declarations. */
static void vhd_complete(void *, struct tiocb *, int);
static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
/*
 * Shared zero buffer: _vhd_zeros is an anonymous read-only mapping of
 * _vhd_zsize bytes created by vhd_initialize(); _vhd_master is the state
 * whose vhd_initialize() call created it and whose vhd_free() unmaps it.
 */
static struct vhd_state *_vhd_master;
static unsigned long _vhd_zsize;
static char *_vhd_zeros;
/*
 * Create the shared zero buffer if it does not exist yet.
 *
 * The buffer is an anonymous read-only mapping of two pages, plus
 * VHD_BLOCK_SIZE when s was opened with VHD_FLAG_OPEN_PREALLOCATE.  The
 * first state to create it becomes _vhd_master.
 *
 * Returns 0 on success (including when the buffer already exists) or a
 * negative errno value if mmap() fails.
 */
static int
vhd_initialize(struct vhd_state *s)
{
	int err;

	if (_vhd_zeros)
		return 0;

	_vhd_zsize = 2 * getpagesize();
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
		_vhd_zsize += VHD_BLOCK_SIZE;

	_vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
			  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (_vhd_zeros == MAP_FAILED) {
		/* Capture errno before logging: the logging call may
		 * overwrite it, which could turn the failure into a
		 * bogus "return 0". */
		err = errno ? -errno : -ENOMEM;
		EPRINTF("vhd_initialize failed: %d\n", err);
		_vhd_zeros = NULL;
		_vhd_zsize = 0;
		return err;
	}

	_vhd_master = s;
	return 0;
}
/*
 * Tear down the shared zero buffer.  Only the state that created the
 * mapping (_vhd_master) releases it; calls from any other state, or when
 * no mapping exists, do nothing.
 */
static void
vhd_free(struct vhd_state *s)
{
	if (_vhd_master == s && _vhd_zeros) {
		munmap(_vhd_zeros, _vhd_zsize);

		_vhd_master = NULL;
		_vhd_zeros = NULL;
		_vhd_zsize = 0;
	}
}
/*
 * Return the shared zero buffer for a caller that needs `size` zero bytes.
 * If the buffer is missing or shorter than `size`, the request is logged
 * with the caller's name and ASSERT(0) fires.
 */
static char *
_get_vhd_zeros(const char *func, unsigned long size)
{
	int usable = (_vhd_zeros && size <= _vhd_zsize);

	if (!usable) {
		EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
			func, size, _vhd_zsize, _vhd_zeros);
		ASSERT(0);
	}

	return _vhd_zeros;
}
/* Convenience wrapper that passes the calling function's name. */
#define vhd_zeros(size) _get_vhd_zeros(__func__, size)
/*
 * Mark block blk in the batmap and log that it is completely full.
 * Does nothing when no batmap is loaded.
 */
static inline void
set_batmap(struct vhd_state *s, uint32_t blk)
{
	if (!s->bat.batmap.map)
		return;

	vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
	DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
}
/*
 * Query the batmap for block blk.  Returns 0 when no batmap is loaded,
 * otherwise whatever vhd_batmap_test() reports.
 */
static inline int
test_batmap(struct vhd_state *s, uint32_t blk)
{
	if (s->bat.batmap.map)
		return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk);

	return 0;
}
/*
 * Overwrite the last 512 bytes of the image file with the byte 0xc7.
 * Images whose footer type is HD_TYPE_FIXED are left untouched.
 *
 * Returns 0 on success or a negative errno value; a short write is
 * reported as -EIO.
 */
static int
vhd_kill_footer(struct vhd_state *s)
{
	int err;
	off64_t end;
	ssize_t written;
	void *buf = NULL;

	if (s->vhd.footer.type == HD_TYPE_FIXED)
		return 0;

	/* posix_memalign reports failure through its return value. */
	err = posix_memalign(&buf, 512, 512);
	if (err)
		return -err;

	/* memset uses only the low byte of its fill argument. */
	memset(buf, 0xc7, 512);

	/* errno is captured at each failure site, before free() or any
	 * other call gets a chance to overwrite it. */
	end = lseek64(s->vhd.fd, 0, SEEK_END);
	if (end == -1) {
		err = errno ? -errno : -EIO;
		goto out;
	}

	if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1) {
		err = errno ? -errno : -EIO;
		goto out;
	}

	written = write(s->vhd.fd, buf, 512);
	if (written < 0)
		err = errno ? -errno : -EIO;
	else if (written != 512)
		err = -EIO;	/* short write: errno is not meaningful */

out:
	free(buf);
	return err;
}
/*
 * Compute s->next_db, the position at which the next data block will be
 * allocated: start at the end of the headers (rounded up by
 * secs_round_up) and move past every allocated block found in the BAT.
 *
 * Returns 0 on success or the error from vhd_end_of_headers().
 */
static inline int
find_next_free_block(struct vhd_state *s)
{
	int err;
	off64_t eom;
	uint32_t i, entry;

	err = vhd_end_of_headers(&s->vhd, &eom);
	if (err)
		return err;

	s->next_db = secs_round_up(eom);

	for (i = 0; i < s->bat.bat.entries; i++) {
		entry = bat_entry(s, i);
		if (entry == DD_BLK_UNUSED)
			continue;

		/* Widen before adding: all three operands are 32-bit, so
		 * the sum could otherwise wrap before reaching the 64-bit
		 * next_db. */
		if (entry >= s->next_db)
			s->next_db = (uint64_t)entry + s->spb + s->bm_secs;
	}

	return 0;
}
/*
 * Release the BAT, batmap and BAT write buffer and reset s->bat.
 *
 * The whole struct vhd_bat_state is cleared, not just its leading
 * vhd_bat_t member, so that batmap.map and bat_buf do not stay behind as
 * dangling pointers and a second call is harmless.
 */
static void
vhd_free_bat(struct vhd_state *s)
{
	free(s->bat.bat.bat);
	free(s->bat.batmap.map);
	free(s->bat.bat_buf);
	memset(&s->bat, 0, sizeof(s->bat));
}
/*
 * Load the BAT (and the batmap, if the image has one) and allocate the
 * sector-sized buffer used for BAT writes.
 *
 * For writable opens the next free data block is also located and a
 * batmap read failure is fatal.  For read-only opens the batmap read is
 * retried up to VHD_BATMAP_MAX_RETRIES times and a persistent failure is
 * logged and ignored.
 *
 * Returns 0 on success or a negative error code; on failure everything
 * allocated here is released again.
 */
static int
vhd_initialize_bat(struct vhd_state *s)
{
	int err, batmap_required, i;

	/* Clear the whole BAT state so every pointer in it starts NULL. */
	memset(&s->bat, 0, sizeof(s->bat));

	err = vhd_read_bat(&s->vhd, &s->bat.bat);
	if (err) {
		EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
		return err;
	}

	batmap_required = 1;
	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
		batmap_required = 0;
	} else {
		err = find_next_free_block(s);
		if (err)
			goto fail;
	}

	if (vhd_has_batmap(&s->vhd)) {
		for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
			err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
			if (err) {
				EPRINTF("%s: reading batmap: %d\n",
					s->vhd.file, err);
				if (batmap_required)
					goto fail;
			} else {
				break;
			}
		}
		if (err)
			EPRINTF("%s: ignoring non-critical batmap error\n",
				s->vhd.file);
	}

	err = posix_memalign((void **)&s->bat.bat_buf,
			     VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
	if (err) {
		/* posix_memalign returns a positive error number; negate
		 * it so the result matches this file's other helpers. */
		err = -err;
		s->bat.bat_buf = NULL;
		goto fail;
	}

	return 0;

fail:
	vhd_free_bat(s);
	return err;
}
/*
 * Free the map and shadow buffers of every bitmap cache slot, empty the
 * free list and zero the slot array.
 */
static void
vhd_free_bitmap_cache(struct vhd_state *s)
{
	int slot;

	for (slot = 0; slot < VHD_CACHE_SIZE; slot++) {
		free(s->bitmap_list[slot].map);
		free(s->bitmap_list[slot].shadow);
		s->bitmap_free[slot] = NULL;
	}

	memset(s->bitmap_list, 0, sizeof(s->bitmap_list));
}
/*
 * Allocate and zero the map and shadow buffers (s->bm_secs sectors each,
 * 512-byte aligned) for all VHD_CACHE_SIZE cache slots and put every slot
 * on the free list.
 *
 * Returns 0 on success or a negative errno value; on failure all buffers
 * allocated so far are released.
 */
static int
vhd_initialize_bitmap_cache(struct vhd_state *s)
{
	int i, err, map_size;
	struct vhd_bitmap *bm;

	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);

	s->bm_lru = 0;
	map_size = vhd_sectors_to_bytes(s->bm_secs);
	s->bm_free_count = VHD_CACHE_SIZE;

	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		bm = s->bitmap_list + i;

		err = posix_memalign((void **)&bm->map, 512, map_size);
		if (err) {
			bm->map = NULL;
			goto fail;
		}

		err = posix_memalign((void **)&bm->shadow, 512, map_size);
		if (err) {
			bm->shadow = NULL;
			goto fail;
		}

		memset(bm->map, 0, map_size);
		memset(bm->shadow, 0, map_size);
		s->bitmap_free[i] = bm;
	}

	return 0;

fail:
	vhd_free_bitmap_cache(s);
	/* posix_memalign returns a positive error number; negate it so
	 * the result matches this file's other helpers. */
	return -err;
}
/*
 * Read and validate the dynamic-disk header, derive the geometry values
 * (sectors per page, sectors per block, bitmap size in sectors) and,
 * unless the image was opened with VHD_FLAG_OPEN_NO_CACHE, set up the BAT
 * and the bitmap cache.
 *
 * Returns 0 on success, -EINVAL for an unsupported header version, or the
 * error reported by the failing helper.
 */
static int
vhd_initialize_dynamic_disk(struct vhd_state *s)
{
	int rc = vhd_get_header(&s->vhd);

	if (rc) {
		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
			EPRINTF("Error reading VHD DD header.\n");
		return rc;
	}

	if (s->vhd.header.hdr_ver != 0x00010000) {
		EPRINTF("unsupported header version! (0x%x)\n",
			s->vhd.header.hdr_ver);
		return -EINVAL;
	}

	s->spp = getpagesize() >> VHD_SECTOR_SHIFT;
	s->spb = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
	s->bm_secs = secs_round_up_no_zero(s->spb >> 3);

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
		return 0;

	rc = vhd_initialize_bat(s);
	if (!rc) {
		rc = vhd_initialize_bitmap_cache(s);
		/* Undo the BAT setup if the cache could not be built. */
		if (rc)
			vhd_free_bat(s);
	}

	return rc;
}
/*
 * Reject images whose creator application starts with "tap" and whose
 * creator version is newer than VHD_CURRENT_VERSION.  Images from any
 * other creator are accepted without a version check.
 *
 * Returns 0 if the image is acceptable, -EINVAL otherwise (with a warning
 * unless the image was opened with VHD_FLAG_OPEN_QUIET).
 */
static int
vhd_check_version(struct vhd_state *s)
{
	int made_by_tap = (strncmp(s->vhd.footer.crtr_app, "tap", 3) == 0);

	if (!made_by_tap || s->vhd.footer.crtr_ver <= VHD_CURRENT_VERSION)
		return 0;

	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
		EPRINTF("WARNING: %s vhd creator version 0x%08x, "
			"but only versions up to 0x%08x are "
			"supported for IO\n", s->vhd.file,
			s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);

	return -EINVAL;
}
static void
vhd_log_open(struct vhd_state *s)
{
char buf[5];
uint32_t i, allocated, full;
if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
return;
snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app);
if (!vhd_type_dynamic(&s->vhd)) {
DPRINTF("%s version: %s 0x%08x\n",
s->vhd.file, buf, s->vhd.footer.crtr_ver);
return;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -