journal.c
/*
 * linux/fs/journal.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Generic filesystem journal-writing code; part of the ext2fs
 * journaling system.
 *
 * This file manages journals: areas of disk reserved for logging
 * transactional updates.  This includes the kernel journaling thread
 * which is responsible for scheduling updates to the log.
 *
 * We do not actually manage the physical storage of the journal in this
 * file: that is left to a per-journal policy function, which allows us
 * to store the journal within a filesystem-specified area for ext2
 * journaling (ext2 can use a reserved inode for storing the log).
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/locks.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/uaccess.h>
#include <linux/proc_fs.h>

EXPORT_SYMBOL(journal_start);
EXPORT_SYMBOL(journal_try_start);
EXPORT_SYMBOL(journal_restart);
EXPORT_SYMBOL(journal_extend);
EXPORT_SYMBOL(journal_stop);
EXPORT_SYMBOL(journal_lock_updates);
EXPORT_SYMBOL(journal_unlock_updates);
EXPORT_SYMBOL(journal_get_write_access);
EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
EXPORT_SYMBOL(journal_dirty_data);
EXPORT_SYMBOL(journal_dirty_metadata);
#if 0
EXPORT_SYMBOL(journal_release_buffer);
#endif
EXPORT_SYMBOL(journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
#endif
EXPORT_SYMBOL(journal_flush);
EXPORT_SYMBOL(journal_revoke);

EXPORT_SYMBOL(journal_init_dev);
EXPORT_SYMBOL(journal_init_inode);
EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
EXPORT_SYMBOL(journal_recover);
EXPORT_SYMBOL(journal_update_superblock);
EXPORT_SYMBOL(journal_abort);
EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
EXPORT_SYMBOL(journal_clear_err);
EXPORT_SYMBOL(log_wait_commit);
EXPORT_SYMBOL(log_start_commit);
EXPORT_SYMBOL(journal_wipe);
EXPORT_SYMBOL(journal_blocks_per_page);
EXPORT_SYMBOL(journal_flushpage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_bmap);
EXPORT_SYMBOL(journal_force_commit);

static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
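/*
 * For orientation, the exported calls above are typically driven by a
 * filesystem (such as ext3) in a begin/access/dirty/stop pattern.  A
 * minimal sketch, assuming the caller already holds a journal_t from
 * journal_init_inode() or journal_init_dev(); error handling and
 * buffer lookup are elided, and this is illustrative rather than code
 * from this file:
 *
 *	handle_t *handle = journal_start(journal, 1);
 *	if (!IS_ERR(handle)) {
 *		journal_get_write_access(handle, bh);
 *		... modify the metadata in bh ...
 *		journal_dirty_metadata(handle, bh);
 *		journal_stop(handle);
 *	}
 */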
/*
 * journal_datalist_lock is used to protect data buffers:
 *
 *	bh->b_transaction
 *	bh->b_tprev
 *	bh->b_tnext
 *
 * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
 * async wrt everything else.
 *
 * It is also used for checkpoint data, also to protect against
 * journal_try_to_free_buffer():
 *
 *	bh->b_cp_transaction
 *	bh->b_cpnext
 *	bh->b_cpprev
 *	transaction->t_checkpoint_list
 *	transaction->t_cpnext
 *	transaction->t_cpprev
 *	journal->j_checkpoint_transactions
 *
 * It is global at this time rather than per-journal because it's
 * impossible for __journal_free_buffer to go from a buffer_head
 * back to a journal_t unracily (well, not true.  Fix later)
 *
 * The `datalist' and `checkpoint list' functions are quite
 * separate and we could use two spinlocks here.
 *
 * lru_list_lock nests inside journal_datalist_lock.
 */
spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;

/*
 * jh_splice_lock needs explanation.
 *
 * In a number of places we want to do things like:
 *
 *	if (buffer_jbd(bh) && bh2jh(bh)->foo)
 *
 * This is racy on SMP, because another CPU could remove the journal_head
 * in the middle of this expression.  We need locking.
 *
 * But we can greatly optimise the locking cost by testing BH_JBD
 * outside the lock.  So, effectively:
 *
 *	ret = 0;
 *	if (buffer_jbd(bh)) {
 *		spin_lock(&jh_splice_lock);
 *		if (buffer_jbd(bh)) {	(* Still there? *)
 *			ret = bh2jh(bh)->foo;
 *		}
 *		spin_unlock(&jh_splice_lock);
 *	}
 *	return ret;
 *
 * Now, that protects us from races where another CPU can remove the
 * journal_head.  But it doesn't defend us from the situation where another
 * CPU can *add* a journal_head.  This is a correctness issue.  But it's not
 * a problem because a) the calling code was *already* racy and b) it often
 * can't happen at the call site and c) the places where we add journal_heads
 * tend to be under external locking.
 */
spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;

/*
 * List of all journals in the system.  Protected by the BKL.
 */
static LIST_HEAD(all_journals);

/*
 * Helper function used to manage commit timeouts
 */
static void commit_timeout(unsigned long __data)
{
	struct task_struct *p = (struct task_struct *) __data;

	wake_up_process(p);
}

/* Static check for data structure consistency.  There's no code
 * invoked --- we'll just get a linker failure if things aren't right.
 */
void __journal_internal_check(void)
{
	extern void journal_bad_superblock_size(void);

	if (sizeof(struct journal_superblock_s) != 1024)
		journal_bad_superblock_size();
}
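/*
 * The check above works because the compiler folds the sizeof
 * comparison to a constant: if the invariant holds, the branch is
 * dead code and the call disappears; if it is violated, the call to
 * the deliberately-undefined function survives and the final link
 * fails.  The same trick can assert any compile-time-constant
 * invariant.  A hedged sketch (the function name below is made up
 * for illustration and is not part of this file):
 *
 *	extern void journal_bad_header_size(void);
 *	if (sizeof(journal_header_t) != 12)
 *		journal_bad_header_size();
 */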
/*
 * kjournald: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 */

journal_t *current_journal;		// AKPM: debug

int kjournald(void *arg)
{
	journal_t *journal = (journal_t *) arg;
	transaction_t *transaction;
	struct timer_list timer;

	current_journal = journal;

	lock_kernel();
	daemonize();

	spin_lock_irq(&current->sigmask_lock);
	sigfillset(&current->blocked);
	recalc_sigpending(current);
	spin_unlock_irq(&current->sigmask_lock);

	sprintf(current->comm, "kjournald");

	/* Set up an interval timer which can be used to trigger a
	   commit wakeup after the commit interval expires */
	init_timer(&timer);
	timer.data = (unsigned long) current;
	timer.function = commit_timeout;
	journal->j_commit_timer = &timer;

	/* Record that the journal thread is running */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
	       journal->j_commit_interval / HZ);
	list_add(&journal->j_all_journals, &all_journals);

	/* And now, wait forever for commit wakeup events. */
	while (1) {
		if (journal->j_flags & JFS_UNMOUNT)
			break;

		jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
			journal->j_commit_sequence, journal->j_commit_request);

		if (journal->j_commit_sequence != journal->j_commit_request) {
			jbd_debug(1, "OK, requests differ\n");
			if (journal->j_commit_timer_active) {
				journal->j_commit_timer_active = 0;
				del_timer(journal->j_commit_timer);
			}
			journal_commit_transaction(journal);
			continue;
		}

		wake_up(&journal->j_wait_done_commit);
		interruptible_sleep_on(&journal->j_wait_commit);

		jbd_debug(1, "kjournald wakes\n");

		/* Were we woken up by a commit wakeup event? */
		if ((transaction = journal->j_running_transaction) != NULL &&
		    time_after_eq(jiffies, transaction->t_expires)) {
			journal->j_commit_request = transaction->t_tid;
			jbd_debug(1, "woke because of timeout\n");
		}
	}

	if (journal->j_commit_timer_active) {
		journal->j_commit_timer_active = 0;
		del_timer_sync(journal->j_commit_timer);
	}

	list_del(&journal->j_all_journals);

	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	return 0;
}

static void journal_start_thread(journal_t *journal)
{
	kernel_thread(kjournald, (void *) journal,
		      CLONE_VM | CLONE_FS | CLONE_FILES);
	while (!journal->j_task)
		sleep_on(&journal->j_wait_done_commit);
}

static void journal_kill_thread(journal_t *journal)
{
	journal->j_flags |= JFS_UNMOUNT;

	while (journal->j_task) {
		wake_up(&journal->j_wait_commit);
		sleep_on(&journal->j_wait_done_commit);
	}
}

#if 0

This is no longer needed - we do it in commit quite efficiently.
Note that if this function is resurrected, the loop needs to
be reorganised into the next_jh/last_jh algorithm.

/*
 * journal_clean_data_list: cleanup after data IO.
 *
 * Once the IO system has finished writing the buffers on the transaction's
 * data list, we can remove those buffers from the list.  This function
 * scans the list for such buffers and removes them cleanly.
 *
 * We assume that the journal is already locked.
 * We are called with journal_datalist_lock held.
 *
 * AKPM: This function looks inefficient.  Approximately O(n^2)
 * for potentially thousands of buffers.  It no longer shows on profiles
 * because these buffers are mainly dropped in journal_commit_transaction().
 */

void __journal_clean_data_list(transaction_t *transaction)
{
	struct journal_head *jh, *next;

	assert_spin_locked(&journal_datalist_lock);

restart:
	jh = transaction->t_sync_datalist;
	if (!jh)
		goto out;

	do {
		next = jh->b_tnext;
		if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
			struct buffer_head *bh = jh2bh(jh);
			BUFFER_TRACE(bh, "data writeout complete: unfile");
			__journal_unfile_buffer(jh);
			jh->b_transaction = NULL;
			__journal_remove_journal_head(bh);
			refile_buffer(bh);
			__brelse(bh);
			goto restart;
		}
		jh = next;
	} while (transaction->t_sync_datalist &&
		 jh != transaction->t_sync_datalist);
out:
	return;
}
#endif

/*
 * journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO.  If we end up using the existing buffer_head's data
 * for the write, then we *have* to lock the buffer to prevent anyone
 * else from using and possibly modifying it while the IO is in
 * progress.
 *
 * The function returns a pointer to the buffer_heads to be used for IO.
 *
 * We assume that the journal has already been locked in this function.
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */
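/*
 * For context, the escaping described above is undone at recovery
 * time: when the replay pass finds a descriptor tag with the escape
 * flag set, it puts the magic number back into the first word of the
 * block before writing it home.  A hedged sketch of that step (the
 * real code lives in recovery.c; names follow jbd.h):
 *
 *	if (tag_flags & JFS_FLAG_ESCAPE)
 *		*((unsigned int *) bh->b_data) =
 *			htonl(JFS_MAGIC_NUMBER);
 */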
static inline unsigned long virt_to_offset(void *p)
{
	return ((unsigned long) p) & ~PAGE_MASK;
}

int journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head *jh_in,
				  struct journal_head **jh_out,
				  int blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct journal_head *new_jh;
	struct page *new_page;
	unsigned int new_offset;

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));

	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = virt_to_offset(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
	}

	mapped_data = ((char *) kmap(new_page)) + new_offset;

	/*
	 * Check for escaping
	 */
	if (*((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
		jh_in->b_frozen_data = tmp;
		memcpy(tmp, mapped_data, jh2bh(jh_in)->b_size);

		/* If we get to this path, we'll always need the new
		   address kmapped so that we can clear the escaped
		   magic number below. */
		kunmap(new_page);
		new_page = virt_to_page(tmp);
		new_offset = virt_to_offset(tmp);
		mapped_data = ((char *) kmap(new_page)) + new_offset;

		done_copy_out = 1;
	}

	/*
	 * Right, time to make up the new buffer_head.
	 */
	do {
		new_bh = get_unused_buffer_head(0);
		if (!new_bh) {
			printk(KERN_NOTICE __FUNCTION__
			       ": ENOMEM at get_unused_buffer_head, "
			       "trying again.\n");
			current->policy |= SCHED_YIELD;
			schedule();
		}
	} while (!new_bh);

	/* keep subsequent assertions sane */
	new_bh->b_prev_free = 0;
	new_bh->b_next_free = 0;