📄 logredo.c
字号:
/*
 *   Copyright (c) International Business Machines Corp., 2000-2002
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <config.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>

#include "jfs_types.h"
#include "jfs_endian.h"
#include "jfs_filsys.h"
#include "jfs_superblock.h"
#include "jfs_dinode.h"
#include "jfs_dtree.h"
#include "jfs_xtree.h"
#include "jfs_logmgr.h"
#include "jfs_dmap.h"
#include "jfs_imap.h"
#include "logredo.h"
#include "logform.h"
#include "devices.h"
#include "debug.h"
#include "utilsubs.h"
#include "fsck_message.h"	/* chkdsk message logging facility */

/*
 * ---------------------------------------------------------------------
 * Local macro definitions
 * ---------------------------------------------------------------------
 */

/* Build a dev_t from __x shifted left by 16 bits, OR'ed with __y. */
#define MAKEDEV(__x,__y)	(dev_t)(((__x)<<16) | (__y))

/*
 * Shift x left by L2LOGPSIZE.
 * NOTE(review): by its name this converts a log page number to a byte
 * offset -- confirm against jfs_logmgr.h.
 */
#define LOGPNTOB(x)	((x)<<L2LOGPSIZE)

/*
 * LOG2NUM(NUM, L2NUM): store floor(log2(NUM)) into L2NUM, or -1 when
 * NUM <= 0.
 *
 * Caveats for callers:
 *   - NUM is shifted right in place while counting, so it must be a
 *     modifiable lvalue and is left at 1 for any input greater than 1.
 *   - The expansion is a bare { } block (not do { } while (0)), so take
 *     care when using it as the body of an unbraced if/else.
 */
#define LOG2NUM(NUM, L2NUM)\
{\
	if ((NUM) <= 0)\
		L2NUM = -1;\
	else\
	if ((NUM) == 1)\
		L2NUM = 0;\
	else\
	{\
		L2NUM = 0;\
		while ( (NUM) > 1 )\
		{\
			L2NUM++;\
			(NUM) >>= 1;\
		}\
	}\
}

/*
 * ---------------------------------------------------------------------
 * Bookkeeping used to remember a memory allocation failure
 * ---------------------------------------------------------------------
 */
int32_t Insuff_memory_for_maps = 0;
char *available_stg_addr = NULL;
int32_t available_stg_bytes = 0;
char *bmap_stg_addr = NULL;
int32_t bmap_stg_bytes = 0;

/*
 * ---------------------------------------------------------------------
 * Log state
 * ---------------------------------------------------------------------
 */
struct logsuper logsup;		/* log superblock */
int32_t numdoblk;		/* count of "do" blocks used */
int32_t numnodofile;		/* count of "nodo file" blocks used */
int32_t numExtDtPg = 0;		/* count of extended dtpage blocks used */

/*
 * Table of open file system aggregates / logical volumes.
 *
 * logredo() processes a single log.
 *
 * In the first release that log relates to the single fileset in a
 * single aggregate.  A future release may let one log serve several
 * filesets, which may or may not all reside in the same aggregate.
 */
struct vopen vopen[MAX_ACTIVE];
struct log_info Log;

struct {
	uuid_t uuid;
	FILE *fp;
} primary_vol;

extern int LogOpenMode;		/* set to O_RDONLY by logdump */

/*
 * When this flag is set, the primary aggregate superblock is corrupt.
 * The secondary superblock is good, but chkdsk could not repair the
 * primary copy.  logredo can still run, but it must work from the
 * secondary copy of the aggregate superblock.
 */
int32_t use_2ndary_agg_superblock;

/*
 * File system page buffer cache
 *
 * For k > 0, bufhdr[k] describes the contents of buffer[k-1].
 * bufhdr[0] is reserved as the anchor of the free/lru list:
 *   bufhdr[0].next refers to the MRU buffer (head),
 *   bufhdr[0].prev refers to the LRU buffer (tail).
 */

/* Buffer header table; the leading number in each note is the field size. */
struct bufhdr {
	int16_t next;		/* 2: successor on the free/lru list */
	int16_t prev;		/* 2: predecessor on the free/lru list */
	int16_t hnext;		/* 2: successor on the hash chain */
	int16_t hprev;		/* 2: predecessor on the hash chain */
	char modify;		/* 1: buffer has been modified */
	char inuse;		/* 1: buffer is on a hash chain */
	int16_t reserve;	/* 2 */
	int32_t vol;		/* 4: minor of aggregate/lv number */
	pxd_t pxd;		/* 8: on-disk page pxd */
} bufhdr[NBUFPOOL];		/* (24) */

/* Buffer table: one PSIZE-byte page per entry. */
struct bufpool {
	char bytes[PSIZE];
} buffer[NBUFPOOL - 1];

/*
 * Log page buffer cache
 *
 * The log has its own 4 page buffer pool.
 */
uint8_t afterdata[LOGPSIZE * 2];	/* staging area for redopage data */

/*
 * Miscellaneous
 */
caddr_t prog;			/* program name */
int32_t mntcnt;
int32_t bufsize;
char *mntinfo;
int32_t retcode;		/* value returned from logredo */
int end_of_transaction = 0;

/*
 * Symbols defined in other translation units
 */
extern char *optarg;
extern int optind;
extern int initMaps(int32_t);
extern int updateMaps(int);
extern int findEndOfLog(void);
extern int logRead(int32_t, struct lrd *, char *);
extern int logredoInit(void);
extern int doCommit(struct lrd *);
extern int doExtDtPg(void);
extern int doNoRedoFile(struct lrd *, uint32_t);
extern int doNoRedoPage(struct lrd *);
extern int doNoRedoInoExt(struct lrd *);
extern int doAfter(struct lrd *, int32_t);
extern int doUpdateMap(struct lrd *);
extern int alloc_wrksp(uint32_t, int, int, void **);
extern FILE * open_by_label(uuid_t, int, int, char *, int *);
extern char log_device[];

/*
 * Prototypes for routines declared ahead of their use in this file
 */
int doMount(struct lrd *);
int openVol(int32_t);
int updateSuper(int vol);
int rdwrSuper(FILE *, struct superblock *, int32_t);
int bflush(int32_t, struct bufpool *);
int logOpen(void);
int fsError(int, int, int64_t);
int logError(int, int);
static int recoverExtendFS(FILE *);
int alloc_storage(int32_t, void **, int32_t *);
int alloc_dmap_bitrec(struct dmap_bitmaps **);

/*
 * Debug control: dflag is 1 when built with _JFS_DEBUG, 0 otherwise.
 */
#ifdef _JFS_DEBUG
int32_t dflag = 1;
time_t *Tp;
uint32_t tp_start;
uint32_t tp_end;
int xdump(char *, int);
int x_scmp(char *, char *);
void x_scpy(char *, char *);
int prtdesc(struct lrd *);
#else
int32_t dflag = 0;
#endif

/*
 * NAME:        jfs_logredo()
 *
 * FUNCTION:    Replay all transactions committed since the most
 *              recent synch point.
 *
 * NOTES:
 *      >>>>>>  The log replay is accomplished in one pass over the
 *              log, reading backwards from logend to the first synch
 *              point record encountered.  This means that the log
 *              entries are read and processed in LIFO (Last-In-First-Out)
 *              order.  In other words, the records logged latest in
 *              time are the first records processed during log replay.
 *
 *      >>>>>>  Inodes, index trees, and directory trees
 *
 *              Inodes, index tree structures, and directory tree
 *              structures are handled by processing committed redopage
 *              records which have not been superseded by noredo records.
 *              This processing copies data from the log record into the
 *              appropriate disk extent page(s).
 *
 *              To ensure that only the last (in time) updates to any
 *              given disk page are applied during log replay, logredo
 *              maintains a record (union structure summary1/summary2),
 *              for each disk page which it has processed, of which
 *              portions have been updated by log records encountered.
 *
 *      >>>>>>  Inode Allocation Map processing
 *
 *      The xtree for the Inode Allocation Map is journaled, and
 *      a careful write is used to update it during commit
 *      processing.
 *      The imap index tree is also duplicated at the known location. (TBD)
 *      So at logredo time, the xtree for imap is always readable and correct.
 *      This is the basic requirement from logredo.
 *
 *      The inode map control page (struct dinomap) is only flushed to disk at
 *      umount time. For an iag, the pmap will go to disk at commit time.
 *      iagnum will not change at run-time.
 *      The agstart field will be stable without the extendfs utility. How to
 *      handle agstart when the extendfs utility is available is TBD.
 *      Other fields (wmap, inosmap, extsmap, ino free list pointers,
 *      ino ext free list pointers) are at working status (i.e. they are
 *      updated at run-time). So the following
 *      meta-data of the imap needs to be reconstructed at logredo time:
 *       1) IAGs, the pmap of imap and inoext array are contained in IAGs.
 *       2) AG Free inode list
 *       3) AG Free Inode Extent list
 *       4) IAG Free list
 *
 *      There are two imaps that need to be taken care of:
 *       1) aggregate imap
 *       2) fileset imap
 *      For the first release, the aggregate imap is stable and we only
 *      need to deal with the fileset imap.
 *
 *      Block Allocation Map (bmap file) is for an aggregate/lv. There are
 *      three fields related to the size of the bmap file.
 *       1) superblock.s_size: This field indicates the aggregate size. It
 *                 tells the number of sector-size blocks for this
 *                 aggregate. The size of the aggregate determines
 *                 the size of its bmap file.
 *                 Since the aggregate's superblock is updated
 *                 using sync-write, superblock.s_size is trustworthy
 *                 at logredo time.
 *          note1: mkfs reserves the fsck space. So s_size really
 *                 indicates (size_of_aggregate - fsck_reserve_space)
 *          note2: At mkfs time, the "-s" parameter could be used
 *                 to indicate how large the aggregate/filesystem is.
 *                 One lv contains at most one aggregate/filesystem.
 *                 If "-s" gives a value smaller than the size
 *                 of the lv, it is ok. The space is just wasted.
 *
 *                 Without the "-s" parameter, mkfs will use the whole
 *                 size of the lv to make an aggregate/filesystem.
 *                 That is usually the case. So we can also say
 *                 an aggregate/lv. "-s" is often used for test.
 *
 *       2) dbmap.dn_mapsize: This field also indicates aggregate/lv size.
 *                 It tells the number of aggregate blocks in the
 *                 aggregate/lv. Without extendfs, this field should
 *                 be equivalent to superblock.s_size.
 *                 With extendfs, this field may not be updated
 *                 before a system crash happens. So logredo
 *                 needs to update it.
 *       3) dinode.di_size: For an inode of the bmap file, this field indicates
 *                 the logical size of the file. I.e. it contains
 *                 the offset value of the last byte written
 *                 in the file plus one.
 *                 So di_size will include the bmap control page,
 *                 the dmap control pages and dmap pages.
 *                 In JFS, if a file is a sparse file, the logical
 *                 size is different from its physical size.
 *                 The bmap file is a sparse file if the total of
 *                 dmap pages is ( < 1024) or ( < 1024 * 1024).
 *                 In that case, physically L1.0, and/or L2 does
 *                 not exist, but di_size will include their page
 *                 size.
 *
 *           Note: The di_size does NOT contain the logical
 *                 structure of the file, i.e. the space allocated
 *                 for the xtree stuff is not indicated in di_size.
 *                 It is indicated in di_nblocks.
 *
 *                 In addition, mkfs always puts one more dmap
 *                 page into the bmap file for preparing extendfs.
 *                 This hidden dmap page cannot be figured out from
 *                 superblock.s_size, but di_size includes it. Any
 *                 dmapctl pages caused by this hidden dmap page
 *                 are also included in di_size.
 *
 *      The bmap control page, dmap control pages and dmap pages all
 *      need to be rebuilt at logredo time.
 *
 *      Overall, the following actions are taken at logredo time:
 *        1) apply log rec data to the specified page.
 *        2) initialize freelist for dtree page or root.
 *        3) rebuild imap
 *        4) rebuild bmap
 *      In addition, in order to ensure that a log record is applied to a
 *      given portion of a page only once, logredo will start the NoRedoFile,
 *      NoRedoExtent/NoRedoPage filter in the process for accuracy and
 *      efficiency.
 *
 *      The four log rec types: REDOPAGE, NOREDOPAGE, NOREDOINOEXT, and
 *      UPDATEMAP, are the main force to initiate these actions.  See
 *      comments on doAfter(), updatePage(), doNoRedoPage(), doNoRedoInoExt(),
 *      and doUpdateMap() for detailed information.
 *
 *      If the aggregate/lv has state of FM_DIRTY, then fsck will run
 *      after the logredo process since logredo could not get 100%
 *      recovery. Currently bmap rebuild is slow (1 min per 32 GB),
 *      so logredo will NOT rebuild imap and bmap if fsck will do it
 *      anyway. But logredo still reads the maps in and marks them for starting
 *      the NoRedoExtent/NoRedoPage filter.
 *
 *      The maps are rebuilt in the following way:
 *      At the init phase, storage is allocated for the whole map file for
 *      both imap and bmap, and the map files are read in from the disk.
 *      The wmap is inited to zero. At logredo time, the wmap is used
 *      to track the bits in pmap. In the beginning of the logredo process
 *      the allocation status of every block is in doubt. As log records
 *      are processed, the allocation state is determined and the bit of pmap
 *      is updated. This fact is recorded in the corresponding bits in wmap.
 *      So a pmap bit is only updated once at logredo time and only updated
 *      by the latest in time log record.
 *      At the end of logredo, the control information, the freelist, etc.
 *      are built from the value of pmap; then pmap is copied to wmap and
 *      the whole map is written back to disk.
 *
 *      The status field s_state in the superblock of each file-system is
 *      set to FM_CLEAN provided the initial status was either FM_CLEAN
 *      or FM_MOUNT and logredo processing was successful. If an error
 *      is detected in logredo the status is set to FM_LOGREDO. The status
 *      is not changed if its initial value was FM_MDIRTY. fsck should be
 *      run to clean up the probable damage if the status after logredo
 *      is either FM_LOGREDO or FM_MDIRTY.
 *
 *      The log record has the format:
 *         <LogRecordData><LogRecLRD>
 *      At logredo time, the log is read backward. So for every log rec,
 *      we read LogRecLRD, which tells how long the LogRecordData is.
 *      See comments on updatePage() for detailed info on the log record format.
 *
 *.....................................................................
 * The logredo handles the log-within-file-system (aka inline log) issue:
 *.....................................................................
 * For AIX, we always deal with the outline log, i.e. the log resides
 * in a separate logical volume. A log is associated with one volume
 * group and can be shared by many file systems within this volume group.
 * In AIX, logredo receives a device name. It then determines if
 * this device is a log name or a filesystem name. If it is a filesystem
 * name, get the log minor number for this filesystem. If it is a log name,
 * get its minor number.
 *
 * XJFS decided to put the log inside the file system.
 *
 * For supporting the inline log, the above AIX logic should be changed.
 *
 * Here is the outline:
 *
 * When logredo receives a device name, it first reads SIZE_OF_SUPER
 * bytes from the SUPER1_OFF offset to see if it is a file system superblock.
 * If yes, check the s_flag to see if it has an inline log or outline log.
 * For an inline log the s_logdev should match the input device name's
 * major and minor number. If not, an error is returned and logredo exits.
 * If no error, logredo reads the log superblock according to the log info
 * in the fs superblock.
 * If the device name does not represent a filesystem device, then logredo
 * reads LOGPSIZE bytes from the log page 1 location. If it indicates
 * a log device, then open the filesystems according to the log superblock's
 * active list. For each filesystem in the active list, read its superblock.
 * If one of the superblocks indicates that it uses an inline log, return
 * an error. It is a system code bug if some filesystems use an inline log
 * and some use an outline log.
 * If the superblock indicates it used an outline log, check the superblock's
 * s_logdev to match the input device name's major and minor numbers.
 * If one of them does not match, return error. -- It is a system code bug
 * if some match and some do not match; -- either all of them should match
 * or none of them. The AIX logredo never checks s_logdev against the input
 * log device. We should check here.
 *
 * For an outline log, logredo will be called once to cover all the file
 * systems in the log superblock's active list.
 * For an inline log, logredo will be called many times. Each time is for
 * one file system. The log superblock's active list has nothing. The
 * logmajor and logminor contain the file system's major and minor number.
 *
 *.....................................................................
 * logredo handles EA support:
 *.....................................................................
 * There is a 16-byte EA descriptor which is located in section I of
 * the dinode.
 * The EA can be inline or outline. If it is inlineEA then the data will
 * occupy section IV of the dinode. The dxd_t.flag will indicate so.
 * If it is outlineEA, dxd_t.flag will indicate so and the single extent
 * is described by the EA descriptor.
 *
 * Section IV of the dinode has 128 bytes. It is shared by the xtroot and
 * inlineEA. The sharing is in FCFS style. If xtree gets section IV,
 * xtree will never give it away even if the xtree is shrunk or split.
 * If inlineEA gets it, there is a chance that later the inlineEA is freed and
 * so xtree still can get it.
 *
 * For outlineEA, XJFS will synchronously write the data portion out so there
 * is no log rec for the data, but there is still an INODE log rec for EA
 * descriptor changes and there is an UPDATEMAP log rec for the allocated
 * pxd. If an outlineEA is freed, there are also two log records for it:
 * one is INODE with the EA descriptor zeroed out, another is the UPDATEMAP
 * log rec for the freed pxd.
 * For inlineEA, it has to be recorded in the log rec. It is not in a
 * separate log rec. Just one additional segment is added into the
 * INODE log rec. So an INODE log rec can have at most three segments:
 * when the parent and child inodes are in the same page, then there is
 * one segment for the parent base inode; one segment for the child base inode;
 * and maybe a third one for the child inlineEA data.
 *....................................................................
 * 32-bit vs 64-bit
 * For the first release, assume that a file system will not be larger
 * than 32-bit.
 *....................................................................
 * TBD:
 * The method for handling crashes in the middle of extending a file
 * system is as follows. The size of a filesystem is established from
 * the superblock.s_size field (i.e. the sizes in the diskmap
 * and inodemaps are ignored). In extendfs (jfs_cntl.c) the superblock
 * is not updated before the maps have been extended and the new inodes
 * formatted to zeros. No allocations in the new part of the filesystem
 * occur prior to the change in map sizes.
if a crash occurs just * before updating the superblock, the map sizes will be their old * values. in this case the maps as files may be bigger than necessary. * if the crash occurs just after writing the super block, the map sizes * are fixed up here. */int jfs_logredo(caddr_t pathname, FILE *fp, int32_t use_2nd_aggSuper){ int rc; int k, logaddr, nextaddr, lastaddr, nlogrecords; int syncrecord = 0; struct lrd ld; int lowest_lr_byte = 2 * LOGPSIZE + LOGPHDRSIZE; int highest_lr_byte = 0; int log_has_wrapped = 0; int logend; int in_use; /*
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -