📄 035_fs_buffer_c.html
字号:
create_empty_buffers的调用者不少:<br>
static int __block_write_full_page(struct inode *inode, struct page
*page, get_block_t *get_block)<br>
static int __block_prepare_write(struct inode *inode, struct page *page,<br>
static int __block_commit_write(struct inode *inode, struct page *page,<br>
<br>
int <font color=#006600>block_prepare_write</font>(struct page *page,
unsigned from, unsigned to,...)<br>
int <font color=#006600>generic_commit_write</font>(struct file *file,
struct page *page,...)<br>
int <font color=#006600>block_read_full_page</font>(struct page *page,
get_block_t *get_block)<br>
int <font color=#006600>block_write_full_page</font>(struct page *page,
get_block_t *get_block)<br>
int <font color=#006600>block_truncate_page</font>(struct address_space
*mapping, loff_t from, get_block_t *get_block)<br>
int <font color=#006600>brw_page</font>(int rw, struct page *page,
kdev_t dev, int b[], int size)<br>
int <font color=#006600>brw_kiovec</font>(int rw, int nr, struct kiobuf
*iovec[],<br>
<br>
这些函数为读写文件提供了buffer支持,作为一个io entry,使文件和磁盘驱动能够结合起来。手工
trace这些函数,就会知道这些buffer没有加入buffer cache,是“真正的文件”内容而非文件的元数据。典型的例子是文件的读写:<br>
do_generic_file_read -> mapping->a_ops->readpage(filp,
page);->ext2_readpage-><font color=#006600>block_read_full_page</font><br>
generic_file_write ->mapping->a_ops->prepare_write(file, page,
offset, offset+bytes);->ext2_prepare_write->
<font color=#006600>block_prepare_write</font><br>
generic_file_write -> mapping->a_ops->commit_write(file, page,
offset, offset+bytes) ->
<font color=#006600>generic_commit_write</font><br>
这里给出一个图示说明page cache, filemap,buffer cache, buffer entry(仅作io
entry的buffer)的关系(也许不是100%正确!!)<br>
<br>
<div id=ips: style="PADDING:1em 0pt; TEXT-ALIGN:left">
<img src=035_fs_buffer_c_images/dcbsxfpf_15cq94jchq.gif style="WIDTH:701px; HEIGHT:606px">
</div>
马上回顾一下buffer_head的回收,就会发现,这种类型的buffer 很自然的进入page
cache继而通过<font color=#006600><b>try_to_free_buffers </b>进行回收.</font><br>
<br>
实在没有必要把这些函数的实现都列到这里仔细讨论了,仅以其中一个为例吧,但是在讨论前还是说一下这些函数的用途吧:<br>
这些函数值得注意的是写文件的方式,第一种提供给具体的文件系统使用,参考generic_file_write,<br>
int <font color=#006600>block_prepare_write</font>(struct page *page,
unsigned from, unsigned to,...)<br>
int <font color=#006600>generic_commit_write</font>(struct file *file,
struct page *page,...)<br>
我们在讨论generic_file 的读写时也涉及到这些函数。<br>
另外一种类型的是
<font color=#006600>block_write_full_page,<font color=#000000>像是上面两个函数的打包,其实其中有不同</font>。</font><br>
我们回顾一下generic_file_write的基本操作流程:<br>
<br>
ssize_t <b>generic_file_write</b>(struct file *file,const char
*buf,size_t count,loff_t *ppos)<br>
{<br>
............ //略过<br>
while (count) {<br>
unsigned long bytes, index,
offset;<br>
char *kaddr;<br>
int deactivate = 1;<br>
<br>
<b>/*</b><br style=FONT-WEIGHT:bold>
<b> * Try to find the page in
the cache. If it isn't there,</b><br style=FONT-WEIGHT:bold>
<b> * allocate a free
page.</b><br style=FONT-WEIGHT:bold>
<b> */</b><br>
offset = (pos &
(PAGE_CACHE_SIZE -1)); /* Within page */<br>
。。。。<br>
page = __grab_cache_page(mapping,
index, &cached_page);<br>
if (!page)<br>
break;<br>
<br>
/* We have exclusive IO access to
the page.. */<br>
if (!PageLocked(page)) {<br>
PAGE_BUG(page);<br>
}<br>
<br>
<b>
/*对于ext2,就是从磁盘先将文件页面读入,如果需要还要为文件分配磁盘block*/</b><br>
status =
mapping->a_ops->prepare_write(file, page, offset, offset+bytes);<br>
if (status)<br>
goto unlock;<br>
kaddr = page_address(page);<br>
status =
<b>copy_from_user</b>(kaddr+offset, buf, bytes);<br>
flush_dcache_page(page);<br>
if (status)<br>
goto
fail_write;<br>
<b> /*对于ext2,就是mark所有bh为dirty,mark
对应 inode为dirty. 见 ext2_aops */</b><br>
status =
mapping->a_ops->commit_write(file, page, offset, offset+bytes);<br>
<br>
.............//略过<br>
/* For now, when the user asks for O_SYNC, we'll
actually<br>
* provide O_DSYNC. */<br>
if ((status >= 0) && (file->f_flags
& O_SYNC))<br>
status =
generic_osync_inode(inode, 1); /* 1 means datasync */<br>
<br>
}<br>
<br>
//mapping->a_ops->prepare_write ->
<font color=#006600>block_prepare_write
--></font>__block_prepare_write<br>
static int <b>__block_prepare_write</b>(struct inode *inode, struct page
*page,<br>
unsigned from, unsigned to,
get_block_t *get_block)<br>
{<br>
<br>
if (!page->buffers)<br>
<font color=#006600><b>create_empty_buffers</b></font>(page,
inode->i_dev, blocksize); <b>//为page 创建 bh io entry</b><br>
........<br>
<br>
for(bh = head, block_start = 0; bh != head ||
!block_start;<br>
block++, block_start=block_end, bh
= bh->b_this_page) {<br>
........<br>
if (!buffer_mapped(bh)) { <br>
err =
get_block(inode, block, bh,
1);<b>//如果没有对应到磁盘上就分配一个磁盘块,ext2,就是ext2_get_block,map
bh到具体设备上的block</b><br>
<br>
<b>if
(buffer_new(bh))</b> {<br>
<font color=#ff0000>
</font><font color=#ff0000><b>unmap_underlying_metadata</b></font><b>(bh);
</b><font color=#cc0000 size=4><b>//这次我们把这个东西讨论清楚...呵呵</b></font><br>
.....<br>
}<br>
}<br>
<br>
if (!buffer_uptodate(bh)
&&<br>
(block_start < from || block_end > to)) {<br>
ll_rw_block(READ, 1, &bh); <b> //read in , make it
uptodate</b><br>
*wait_bh++=bh;<br>
}<br>
}<br>
......<br>
}<br>
<font color=#ff0000><b>unmap_underlying_metadata 曾经是一个很令人困惑的问题,这次终于能够了断了
:-) 我们曾经在linuxforum上有一个讨论,但是基本上没有说到点子上,见这个帖子:<br>
</b></font>linux forum上关于unmap_underlying_metadata 的讨论<br>
<a href=http://www.linuxforum.net/forum/showthreaded.php?Cat=&Board=linuxK&Number=408077&page=&view=&sb=&o= id=is7v title=http://www.linuxforum.net/forum/showthreaded.php?Cat=&Board=linuxK&Number=408077&page=&view=&sb=&o=>http://www.linuxforum.net/forum/showthreaded.php?Cat=&Board=linuxK&Number=408077&page=&view=&sb=&o=</a><br>
这次分析到这里,没有办法,经过刻苦的寻找,终于找到了1999年关于这个问题的一些线索,其实很简单,我终于受到了启发:<br>
<br>
这个讨论启发了我:<br>
<a href=http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html id=qo7f title=http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html>http://www.mail-archive.com/linux-fsdevel@vger.rutgers.edu/msg00298.html</a><br>
<br>
<font size=4><b>问题的根源在于buffer 的释放问题:真正从buffer
cache中消除buffer的函数是</b></font>
<font size=4><b>__bforget</b></font>,
然而只有(少数文件系统直接调用__bforget)<b>unmap_underlying_metadata,
try_to_free_buffers (page_launder)是进入这个过程的常见入口.<br>
</b> 设想这样一个流程:<br>
1) 打开 foo/xxx , 修改xxx的内容<br>
2)rm foo<br>
3)把xxx元数据所占用的block分配给新的文件, 现在,因为rm
foo的时候我们并没有及时调用<font size=4>__bforget, 所以buffer cache 中还有一个alias的buffer.<br>
<font size=2>至于以前讨论的,我们认为通过dd这种操作raw设备的方式所拥有的alias,
并不在</font></font><font size=2><b>unmap_underlying_metadata
</b>考虑的范围内.本来,2.4的时候已经不负责buffer cache和page
cache之间的同步了.这里的必要性不在于这个alias在buffer cache中,而在于它是dirty的,如果不clear掉,就会引起data
corruption. 2.4以后仅仅是drop掉数据就够了.</font><br>
<font size=4><br>
</font>/*<br>
* bforget() is like brelse(), except it puts the buffer on the<br>
* free list if it can.. We can NOT free the buffer if:<br>
* - there are other users of it<br>
* - it is locked and thus can have active IO<br>
*/<br>
void <font color=#006600><b>__bforget</b></font>(struct buffer_head *
buf)<br>
{<br>
/* grab the lru lock here to block bdflush. */<br>
spin_lock(&lru_list_lock);<br>
write_lock(&hash_table_lock);<br>
if (!atomic_dec_and_test(&buf->b_count) ||
buffer_locked(buf))<br>
goto in_use;<br>
<font color=#006600><b>__hash_unlink</b></font><b>(buf);</b><br>
remove_inode_queue(buf);<br>
write_unlock(&hash_table_lock);<br>
__remove_from_lru_list(buf, buf->b_list);<br>
spin_unlock(&lru_list_lock);<br>
put_last_free(buf);<br>
return;<br>
<br>
in_use:<br>
write_unlock(&hash_table_lock);<br>
spin_unlock(&lru_list_lock);<br>
}<br>
/*<br>
* We are taking a block for data and we don't want any output from
any<br>
* buffer-cache aliases starting from return from that function and<br>
* until the moment when something will explicitly mark the buffer<br>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -