📄 035_fs_buffer_c.html

📁 重读linux 2.4.20所写的笔记
💻 HTML
📖 第 1 页 / 共 5 页
字号:
        create_empty_buffers的调用者不少：<br>
        static int __block_write_full_page(struct inode *inode, struct page
        *page, get_block_t *get_block)<br>
        static int __block_prepare_write(struct inode *inode, struct page *page,<br>
        static int __block_commit_write(struct inode *inode, struct page *page,<br>
        <br>
        int <font color=#006600>block_prepare_write</font>(struct page *page,
        unsigned from, unsigned to,...)<br>
        int <font color=#006600>generic_commit_write</font>(struct file *file,
        struct page *page,...)<br>
        int <font color=#006600>block_read_full_page</font>(struct page *page,
        get_block_t *get_block)<br>
        int <font color=#006600>block_write_full_page</font>(struct page *page,
        get_block_t *get_block)<br>
        int <font color=#006600>block_truncate_page</font>(struct address_space
        *mapping, loff_t from, get_block_t *get_block)<br>
        int <font color=#006600>brw_page</font>(int rw, struct page *page,
        kdev_t dev, int b[], int size)<br>
        int <font color=#006600>brw_kiovec</font>(int rw, int nr, struct kiobuf
        *iovec[],<br>
        <br>
        这些函数为读写文件提供了buffer支持，作为一个io entry，使文件和磁盘驱动能够结合起来。手工
        trace这些函数，就会知道这些buffer没有加入buffer cache，是“真正的文件”内容而非文件的元数据。典型的例子是文件的读写：<br>
        do_generic_file_read -&gt; mapping-&gt;a_ops-&gt;readpage(filp,
        page);-&gt;ext2_readpage-&gt;<font color=#006600>block_read_full_page</font><br>
        generic_file_write -&gt;mapping-&gt;a_ops-&gt;prepare_write(file, page,
        offset, offset+bytes);-&gt;ext2_prepare_write-&gt;
        <font color=#006600>block_prepare_write</font><br>
        generic_file_write -&gt; mapping-&gt;a_ops-&gt;commit_write(file, page,
        offset, offset+bytes) -&gt;
        <font color=#006600>generic_commit_write</font><br>
        这里给出一个图示说明page cache， filemap，buffer cache， buffer entry(仅作io
        entry的buffer)的关系(也许不是100%正确!!)<br>
        <br>
        <div id=ips: style="PADDING:1em 0pt; TEXT-ALIGN:left">
          <img src=035_fs_buffer_c_images/dcbsxfpf_15cq94jchq.gif style="WIDTH:701px; HEIGHT:606px">
        </div>
        &nbsp; 马上回顾一下buffer_head的回收,就会发现,这种类型的buffer 很自然的进入page
        cache继而通过<font color=#006600><b>try_to_free_buffers </b>进行回收.</font><br>
        <br>
        实在没有必要把这些函数的实现都列到这里仔细讨论了，仅以其中一个为例吧，但是在讨论前还是说一下这些函数的用途吧：<br>
        这些函数值得注意的是写文件的方式，第一种提供给具体的文件系统使用，参考generic_file_write，<br>
        int <font color=#006600>block_prepare_write</font>(struct page *page,
        unsigned from, unsigned to,...)<br>
        int <font color=#006600>generic_commit_write</font>(struct file *file,
        struct page *page,...)<br>
        我们在讨论generic_file 的读写时也涉及到这些函数。<br>
        另外一种类型的是
        <font color=#006600>block_write_full_page，<font color=#000000>像是上面两个函数的打包，其实其中有不同</font>。</font><br>
        我们回顾一下generic_file_write的基本操作流程：<br>
        <br>
        ssize_t <b>generic_file_write</b>(struct file *file,const char
        *buf,size_t count,loff_t *ppos)<br>
        {<br>
        &nbsp;&nbsp;&nbsp; ............ //略过<br>
        &nbsp;&nbsp;&nbsp; while (count) {<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; unsigned long bytes, index,
        offset;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; char *kaddr;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; int deactivate = 1;<br>
        <br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;
        <b>/*</b><br style=FONT-WEIGHT:bold>
        <b>&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;* Try to find the page in
        the cache. If it isn't there,</b><br style=FONT-WEIGHT:bold>
        <b>&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;* allocate a free
        page.</b><br style=FONT-WEIGHT:bold>
        <b>&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;*/</b><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; offset = (pos &amp;
        (PAGE_CACHE_SIZE -1)); /* Within page */<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; 。。。。<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; page = __grab_cache_page(mapping,
        index, &amp;cached_page);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (!page)<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; break;<br>
        <br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; /* We have exclusive IO access to
        the page.. */<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (!PageLocked(page)) {<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; PAGE_BUG(page);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; }<br>
        <br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;<b>
        /*对于ext2,就是从磁盘先将文件页面读入,如果需要还要为文件分配磁盘block*/</b><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; status =
        mapping-&gt;a_ops-&gt;prepare_write(file, page, offset, offset+bytes);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (status)<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; goto unlock;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; kaddr = page_address(page);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; status =
        <b>copy_from_user</b>(kaddr+offset, buf, bytes);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; flush_dcache_page(page);<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (status)<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; goto
        fail_write;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;<b> /*对于ext2,就是mark所有bh为dirty,mark
        对应 inode为dirty. 见 ext2_aops */</b><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; status =
        mapping-&gt;a_ops-&gt;commit_write(file, page, offset, offset+bytes);<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; .............//略过<br>
        &nbsp;&nbsp;&nbsp; /* For now, when the user asks for O_SYNC, we'll
        actually<br>
        &nbsp;&nbsp;&nbsp; &nbsp;* provide O_DSYNC. */<br>
        &nbsp;&nbsp;&nbsp; if ((status &gt;= 0) &amp;&amp; (file-&gt;f_flags
        &amp; O_SYNC))<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; status =
        generic_osync_inode(inode, 1); /* 1 means datasync */<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br>
        }<br>
        <br>
        //mapping-&gt;a_ops-&gt;prepare_write -&gt;
        <font color=#006600>block_prepare_write
        --&gt;</font>__block_prepare_write<br>
        static int <b>__block_prepare_write</b>(struct inode *inode, struct page
        *page,<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; unsigned from, unsigned to,
        get_block_t *get_block)<br>
        {<br>
        <br>
        &nbsp;&nbsp;&nbsp; if (!page-&gt;buffers)<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;
        <font color=#006600><b>create_empty_buffers</b></font>(page,
        inode-&gt;i_dev, blocksize); <b>//为page 创建 bh io entry</b><br>
        &nbsp;&nbsp; ........<br>
        <br>
        &nbsp;&nbsp;&nbsp; for(bh = head, block_start = 0; bh != head ||
        !block_start;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; block++, block_start=block_end, bh
        = bh-&gt;b_this_page) {<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; ........<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (!buffer_mapped(bh)) {&nbsp;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; err =
        get_block(inode, block, bh,
        1);<b>//如果没有对应到磁盘上就分配一个磁盘块,ext2,就是ext2_get_block,map
        bh到具体设备上的block</b><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; <b>if
        (buffer_new(bh))</b> {<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;
        &nbsp;&nbsp;<font color=#ff0000>&nbsp;
        </font><font color=#ff0000><b>unmap_underlying_metadata</b></font><b>(bh);
        </b><font color=#cc0000 size=4><b>//这次我们把这个东西讨论清楚...呵呵</b></font><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;
        .....<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; }<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; }<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; if (!buffer_uptodate(bh)
        &amp;&amp;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;&nbsp;
        (block_start &lt; from || block_end &gt; to)) {<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;
        ll_rw_block(READ, 1, &amp;bh);&nbsp;<b> //read in , make it
        uptodate</b><br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; *wait_bh++=bh;<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; }<br>
        &nbsp;&nbsp;&nbsp; }<br>
        &nbsp;&nbsp; ......<br>
        }<br>
        <font color=#ff0000><b>unmap_underlying_metadata 曾经是一个很困惑的问题,这次终于能够了断了
        :-) 我们曾经在linuxforum上有一个讨论,但是基本上没有说到点子上,见这个帖子:<br>
        </b></font>linux forum上关于unmap_underlying_metadata 的讨论<br>
        <a href=http://www.linuxforum.net/forum/showthreaded.php?Cat=&amp;Board=linuxK&amp;Number=408077&amp;page=&amp;view=&amp;sb=&amp;o= id=is7v title=http://www.linuxforum.net/forum/showthreaded.php?Cat=&amp;Board=linuxK&amp;Number=408077&amp;page=&amp;view=&amp;sb=&amp;o=>http://www.linuxforum.net/forum/showthreaded.php?Cat=&amp;Board=linuxK&amp;Number=408077&amp;page=&amp;view=&amp;sb=&amp;o=</a><br>
        这次分析到这里,没有办法,经过刻苦的寻找,终于找到了1999年关于这个问题的一些线索,其实很简单,我终于受到了启发:<br>
        <br>
        这个讨论启发了我：<br>
        <a href=http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html id=qo7f title=http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html>http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html</a><br>
        <br>
        <font size=4><b>问题的根源在于buffer 的释放问题:真正从buffer
        cache中消除buffer的函数是</b></font>
        <font size=4><b>__bforget</b></font>,&nbsp;
        然而只有(少数文件系统直接调用__bforget)<b>unmap_underlying_metadata,
        try_to_free_buffers (page_launder)是进入这个过程的常见入口.<br>
        &nbsp;&nbsp;</b> 设想这样一个流程:<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
        1) 打开 foo/xxx , 修改xxx的内容<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
        2)rm foo<br>
        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
        3)把xxx元数据所占用的block分配给新的文件, 现在,因为rm
        foo的时候我们并没有及时调用<font size=4>__bforget, 所以buffer cache 中还有一个alias的buffer.<br>
        <font size=2>至于以前讨论的,我们认为通过dd这种操作raw设备的方式所拥有的alias,
        并不在</font></font><font size=2><b>unmap_underlying_metadata
        </b>考虑的范围内.本来,2.4的时候已经不负责buffer cache和page
        cache之间的同步了.这里的必要性不在于这个alias在buffer cache中,而在于它是dirty的,如果不clear掉,就会引起data
        corruption. 2.4以后仅仅是drop掉数据就够了.</font><br>
        <font size=4><br>
        </font>/*<br>
        &nbsp;* bforget() is like brelse(), except it puts the buffer on the<br>
        &nbsp;* free list if it can.. We can NOT free the buffer if:<br>
        &nbsp;*&nbsp; - there are other users of it<br>
        &nbsp;*&nbsp; - it is locked and thus can have active IO<br>
        &nbsp;*/<br>
        void <font color=#006600><b>__bforget</b></font>(struct buffer_head *
        buf)<br>
        {<br>
        &nbsp;&nbsp;&nbsp; /* grab the lru lock here to block bdflush. */<br>
        &nbsp;&nbsp;&nbsp; spin_lock(&amp;lru_list_lock);<br>
        &nbsp;&nbsp;&nbsp; write_lock(&amp;hash_table_lock);<br>
        &nbsp;&nbsp;&nbsp; if (!atomic_dec_and_test(&amp;buf-&gt;b_count) ||
        buffer_locked(buf))<br>
        &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; goto in_use;<br>
        &nbsp;&nbsp;&nbsp;
        <font color=#006600><b>__hash_unlink</b></font><b>(buf);</b><br>
        &nbsp;&nbsp;&nbsp; remove_inode_queue(buf);<br>
        &nbsp;&nbsp;&nbsp; write_unlock(&amp;hash_table_lock);<br>
        &nbsp;&nbsp;&nbsp; __remove_from_lru_list(buf, buf-&gt;b_list);<br>
        &nbsp;&nbsp;&nbsp; spin_unlock(&amp;lru_list_lock);<br>
        &nbsp;&nbsp;&nbsp; put_last_free(buf);<br>
        &nbsp;&nbsp;&nbsp; return;<br>
        <br>
        &nbsp;in_use:<br>
        &nbsp;&nbsp;&nbsp; write_unlock(&amp;hash_table_lock);<br>
        &nbsp;&nbsp;&nbsp; spin_unlock(&amp;lru_list_lock);<br>
        }<br>
        /*<br>
        &nbsp;* We are taking a block for data and we don't want any output from
        any<br>
        &nbsp;* buffer-cache aliases starting from return from that function and<br>
        &nbsp;* until the moment when something will explicitly mark the buffer<br>
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -