/* gdk_posix.c */
#line 310 "/export/scratch0/monet/monet.GNU.64.64.d.14791/MonetDB/src/gdk/gdk_posix.mx"
#include "monetdb_config.h"
#include "gdk.h"
#include <stdio.h>
#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif

#ifdef WIN32
size_t GDK_mem_pagebits = 16;	/* on Windows, mmap addresses can be set at 64KB granularity */
#else
size_t GDK_mem_pagebits = 14;	/* on Linux, 4KB pages can be addressed */
#endif

#ifndef MAP_NORESERVE
# define MAP_NORESERVE MAP_PRIVATE
#endif

#define MMAP_ADVISE 7
#define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)

/* DDALERT: AIX4.X 64bits needs HAVE_SETENV==0 due to an AIX bug, but configure probably does not detect it as such */
#ifndef HAVE_SETENV
int
setenv(const char *name, const char *value, int overwrite)
{
	int ret = 0;

	if (overwrite || getenv(name) == NULL) {
		char *p = (char *) GDKmalloc(2 + strlen(name) + strlen(value));

		strcpy(p, name);
		strcat(p, "=");
		strcat(p, value);
		ret = putenv(p);
		/* GDKfree(p); LEAK INSERTED DUE TO SOME WEIRD CRASHES */
	}
	return ret;
}
#endif

char *MT_heapbase = NULL;

/* Crude VM buffer management that keeps a list of all memory-mapped regions.
 *
 * a.k.a. "helping stupid VM implementations that ignore VM advice"
 *
 * The main goal is to be able to tell the OS to please stop buffering all
 * memory-mapped pages when under pressure. A major problem is the
 * materialization of large results in newly created memory-mapped files.
 * Operating systems tend to cache all dirty pages, such that when memory
 * runs out, all pages are dirty and cannot be unloaded quickly. The VM
 * panic occurs and comatose OS states may be observed. This is in spite of
 * our use of madvise(MADV_SEQUENTIAL). That is: we would want the OS to
 * drop pages after we've passed them. That does not happen; pages are
 * retained and pollute the buffer cache.
 *
 * Regrettably, at this level, we don't know anything about how Monet is
 * using the mmapped regions. Monet code is totally oblivious of any I/O;
 * that's why it is so easy to create CPU-efficient code in Monet.
 *
 * The current solution focuses on large writable maps. These often
 * represent newly created BATs that are the result of some (running)
 * operator. We assume two things here:
 * - the BAT is created in sequential fashion (almost always true)
 * - afterwards, this BAT is used in sequential fashion (often true)
 *
 * A VMtrim thread keeps an eye on the RSS (memory pressure) and on large
 * writable memory maps. If RSS approaches mem_maxsize(), it starts to
 * *worry*, and starts to write dirty data from these writable maps to disk
 * in 128MB tiles. So, if memory pressure rises further in the near future,
 * the OS has some options to release memory pages cheaply (i.e. without
 * needing I/O). This is also done explicitly by the VM thread: when RSS
 * exceeds mem_maxsize(), it explicitly asks the OS to release pages. The
 * reason is that Linux is not smart enough to do even this. Anyway..
 *
 * The way to free pages explicitly in Linux is to call
 * posix_fadvise(..,POSIX_FADV_DONTNEED). Notably, the supported and
 * documented posix_madvise(..,POSIX_MADV_DONTNEED) doesn't work on Linux.
 * But we do both posix_madvise and posix_fadvise (a standalone sketch of
 * this pattern follows after MT_mmap_init below), so on other Unix systems
 * that don't support posix_fadvise, posix_madvise still might work.
 * On Windows, to our knowledge, there is no way to tell it to stop
 * buffering a memory-mapped region. msync (FlushViewOfFile) does work,
 * though. So let's hope its VM paging algorithm behaves better than Linux,
 * which just runs off the cliff: if MonetDB does not prevent RSS from
 * getting too high, the machine enters a coma.
 *
 * We will only be able to sensibly test this on Windows64. On Windows32,
 * mmap sizes do not significantly exceed RAM sizes, so MonetDB swapping
 * will not actually happen (of course, you've got this nasty problem of VM
 * fragmentation and failing mmaps instead).
 *
 * In principle, page tiles are saved sequentially, and behind it, but never
 * overtaking it, is an "unload-cursor" that frees the pages if that is
 * needed to keep RSS down. There is a tweak in the algorithm that re-sets
 * the unload-cursor if it seems that all tiles to the end have been saved
 * (whether a tile is actually saved is determined by timing the sync
 * action). This means that the producing operator is done creating the
 * BAT, and we assume it is going to be used sequentially afterwards. In
 * that case, we should start unloading right after the 'read-cursor',
 * that is, from the start.
 *
 * EXAMPLE
 * D = dirty tile
 * s = saved tile (i.e. clean)
 * u = unloaded tile
 * L = tile that is being loaded
 *
 *           +--> operator produces BAT
 * (1) DDDDDD|......................................| end of reserved mmap
 *     ____|RSS
 *         |
 *         | at 3/4 of RSS consumed we start to worry
 *
 *                     +--> operator produces BAT
 * (2) DDDDDDDDDDDDDDDD|............................|
 *                    s<----------------------------- VM backwards save thread
 *                    |
 *                    + first tile of which saving costs anything
 *
 *                       +--> operator produces BAT
 * (3) DDDDDDDDDDDDDDDss|D|.........................|
 *      VM-thread save ->|
 *
 * When the RSS target is exceeded, we start unloading tiles..
 *
 *                    +--> VM-thread unload starts at *second* 's'
 *                    |
 *                    |   +--> operator produces BAT
 * (4) DDDDDDDDDDDDDDDsus|DD|........................|
 *       VM-thread save -->|  RSS = Full!
 *
 *                                  +-- 0 => save costs nothing!!
 *      VM-thread save ------------>|   assume bat complete
 * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *                    |<-------- re-set unload cursor
 *                    +--- first tile was not unloaded.
 *
 * later.. some other operator sequentially reads the bat
 * first part is 'D', that is, nicely cached.
 *
 *     ---read------->|
 * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *
 * now we're hitting the unloaded region. the query becomes
 * I/O read bound here (typically 20% CPU utilization).
 *
 *     ---read-------->|
 * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
 *                    /  \
 *        unload cursor   load cursor
 *
 *     ---read---------------->|
 * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                            /  \
 *                unload cursor   load cursor
 *
 *     ---read--------------------->| done
 * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                              ****
 *                              last part still cached
 *
 * note: had we not re-set the unload cursor in (5),
 * the last part would have been lost due to continuing
 * RSS pressure from the 'L' read-cursor.
 *
 * If multiple write-mmaps exist, we do unload-tile and save-tile
 * selection on a round-robin basis among them.
 *
 * Of course, this is a simple solution for simple cases only.
 * (a) if the bat is produced too fast (or your disk is too slow),
 *     RSS will exceed its limit and Linux will go into swapping.
 * (b) if your data is not produced and read sequentially.
 *     Examples are sorting or clustering on huge datasets.
 * (c) if RSS pressure is due to large read-maps, rather than
 *     intermediate results.
 *
 * Two crude suggestions:
 * - If we are under RSS pressure without unloadable tiles and with
 *   savable tiles, we should consider suspending *all* other threads
 *   until we manage to unload a tile.
 * - if there are no savable tiles (or in case of read-only maps),
 *   we could resort to saving and unloading random tiles.
 *
 * To do better, our BAT algorithms should provide even more detailed
 * advice on their access patterns, which may even consist of pointers
 * to the cursors (i.e. pointers to b->batBuns->free or the cursors
 * in radix-cluster), which an enhanced version of this thread might
 * take into account.
 */

#ifdef HAVE_PTHREAD_H
/* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
#undef HAVE_CONFIG_H
#include <sched.h>
#include <pthread.h>
#endif
#ifdef HAVE_SEMAPHORE_H
#include <semaphore.h>
#endif

typedef struct {
	char path[128];		/* mapped file, retained for debugging */
	char *base;		/* base address */
	size_t len;		/* length of map */
	size_t first_tile;	/* from here we started saving tiles */
	size_t save_tile;	/* next tile to save */
	size_t unload_tile;	/* next tile to unload */
	int last_tile;
	int fd;			/* open fd (== -1 for anonymous vm), retained for posix_fadvise */
	int pincnt;		/* incremented while a MIL command uses the heap with a random pattern */
	int writable;
	int next;
} MT_mmap_t;

#ifdef HAVE_POSIX_FADVISE
static int do_not_use_posix_fadvise = 0;
#endif

#define MT_MMAP_TILE (1<<27)	/* 128MB */
#define MT_MMAP_BUFSIZE 100

MT_mmap_t MT_mmap_tab[MT_MMAP_BUFSIZE];
int MT_mmap_cur = -1, MT_mmap_first = -1, MT_mmap_free = 0;
pthread_mutex_t MT_mmap_lock;

static void
MT_mmap_empty(int i)
{
	MT_mmap_tab[i].path[0] = 0;
	MT_mmap_tab[i].base = NULL;
	MT_mmap_tab[i].len = 0;
	MT_mmap_tab[i].writable = 0;
	MT_mmap_tab[i].fd = -1;
	MT_mmap_tab[i].pincnt = -1;
}

static void
MT_mmap_init(void)
{
	int i;

	/* create lock */
	pthread_mutex_init(&MT_mmap_lock, 0);
	for (i = 0; i < MT_MMAP_BUFSIZE; i++) {
		MT_mmap_tab[i].next = i + 1;
		MT_mmap_empty(i);
	}
	MT_mmap_tab[i - 1].next = -1;
}
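/* Illustration (not part of the original file): a minimal standalone sketch,
 * referenced from the big comment above, of the "drop these pages" pattern:
 * issue both posix_madvise() and posix_fadvise() with DONTNEED, so that at
 * least one of the two takes effect on the platform at hand. The function
 * name drop_region and its parameters are hypothetical.
 */
#if 0	/* example only, not compiled */
#include <fcntl.h>
#include <sys/mman.h>

static void
drop_region(int fd, void *base, size_t len)
{
	/* hint the VM system: these mapped pages will not be needed soon
	 * (documented, but ignored by Linux) */
	(void) posix_madvise(base, len, POSIX_MADV_DONTNEED);
	/* hint the buffer cache as well; on Linux this is the call that
	 * actually releases the cached file pages */
	(void) posix_fadvise(fd, 0, (off_t) len, POSIX_FADV_DONTNEED);
}
#endif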
/* returns the previous element (to facilitate deletion) */
static int
MT_mmap_find(void *base)
{
	/* maybe consider a hash table instead of a linked list?? */
	int i, prev = MT_MMAP_BUFSIZE;

	for (i = MT_mmap_first; i >= 0; i = MT_mmap_tab[i].next) {
		if (MT_mmap_tab[i].base == base) {
			return prev;
		}
		prev = i;
	}
	return i;	/* -1: not found */
}

static int
MT_mmap_idx(void *base, size_t len)
{
	if (len > MT_MMAP_TILE) {
		int i = MT_mmap_find(base);

		if (i >= 0) {
			if (i == MT_MMAP_BUFSIZE) {
				return MT_mmap_first;
			} else {
				return MT_mmap_tab[i].next;
			}
		}
	}
	return -1;
}

static int
MT_mmap_new(char *path, void *base, size_t len, int fd, int writable)
{
	(void) pthread_mutex_lock(&MT_mmap_lock);
	if (len > MT_MMAP_TILE && MT_mmap_free >= 0) {
		int i = MT_mmap_free;

		MT_mmap_free = MT_mmap_tab[i].next;
		MT_mmap_tab[i].next = MT_mmap_first;
		MT_mmap_first = i;
		if (MT_mmap_cur == -1)
			MT_mmap_cur = i;
#ifdef MMAP_DEBUG
		stream_printf(GDKerr, "MT_mmap_new: %s fd=%d\n", path, fd);
#endif
		strncpy(MT_mmap_tab[i].path, path, 128);
		MT_mmap_tab[i].base = base;
		MT_mmap_tab[i].len = len;
		MT_mmap_tab[i].save_tile = 1;
		MT_mmap_tab[i].last_tile = 0;
		MT_mmap_tab[i].first_tile = 0;
		MT_mmap_tab[i].unload_tile = 0;
		MT_mmap_tab[i].writable = writable;
		MT_mmap_tab[i].fd = fd;
		MT_mmap_tab[i].pincnt = 0;
		fd = -1;	/* the registry took ownership of fd */
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
	return fd;
}

static void
MT_mmap_del(void *base, size_t len)
{
	if (len > MT_MMAP_TILE) {
		int victim = 0, prev;

		(void) pthread_mutex_lock(&MT_mmap_lock);
		prev = MT_mmap_find(base);
		if (prev >= 0) {
			int ret;

			if (prev == MT_MMAP_BUFSIZE) {
				victim = MT_mmap_first;
				MT_mmap_first = MT_mmap_tab[MT_mmap_first].next;
			} else {
				victim = MT_mmap_tab[prev].next;
				MT_mmap_tab[prev].next = MT_mmap_tab[victim].next;
			}
			if (MT_mmap_cur == victim) {
				MT_mmap_cur = MT_mmap_first;
			}
#ifdef HAVE_POSIX_FADVISE
			if (!do_not_use_posix_fadvise && MT_mmap_tab[victim].fd >= 0) {
				/* tell the OS quite clearly that we want to drop this */
				ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len, POSIX_FADV_DONTNEED);
#ifdef MMAP_DEBUG
				stream_printf(GDKerr, "MT_mmap_del: posix_fadvise(%s,fd=%d,%uMB,POSIX_FADV_DONTNEED) = %d\n",
					      MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd,
					      (unsigned int) (MT_mmap_tab[victim].len >> 20), ret);
#endif
			}
#endif
			ret = close(MT_mmap_tab[victim].fd);
#ifdef MMAP_DEBUG
			stream_printf(GDKerr, "MT_mmap_del: close(%s fd=%d) = %d\n",
				      MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd, ret);
#endif
			MT_mmap_tab[victim].next = MT_mmap_free;
			MT_mmap_empty(victim);
			MT_mmap_free = victim;
			(void) ret;
		}
		(void) pthread_mutex_unlock(&MT_mmap_lock);
	}
}

static int
MT_fadvise(void *base, size_t len, int advice)
{
	int ret = 0;

#ifdef HAVE_POSIX_FADVISE
	if (!do_not_use_posix_fadvise) {
		int i;

		(void) pthread_mutex_lock(&MT_mmap_lock);
		i = MT_mmap_idx(base, len);
		if (i >= 0) {
			if (MT_mmap_tab[i].fd >= 0) {
				ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len, advice);
#ifdef MMAP_DEBUG
				stream_printf(GDKerr, "MT_fadvise: posix_fadvise(%s,fd=%d,%uMB,%d) = %d\n",
					      MT_mmap_tab[i].path, MT_mmap_tab[i].fd,
					      (unsigned int) (len >> 20), advice, ret);
#endif
			}
		}
		(void) pthread_mutex_unlock(&MT_mmap_lock);
	}
#else
	(void) base;
	(void) len;
	(void) advice;
#endif
	return ret;
}
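/* Illustration (not part of the original file): how the registry calls above
 * are presumably meant to compose over a large map's lifetime. The callers
 * are not in this excerpt, so this is a sketch under assumptions: fd/base/len
 * come from an mmap of a heap file, and the map is larger than one tile.
 * Note that MT_mmap_new() returns -1 when the registry takes ownership of
 * the descriptor.
 */
#if 0	/* example only, not compiled */
void
example_lifetime(char *path, void *base, size_t len, int fd)
{
	/* register the map; on success the registry keeps fd for later
	 * posix_fadvise calls and we must no longer close it ourselves */
	fd = MT_mmap_new(path, base, len, fd, 1 /* writable */);
	if (fd >= 0)
		close(fd);	/* not registered (too small or table full) */

	/* announce a sequential scan, so the OS can read ahead */
	(void) MT_fadvise(base, len, POSIX_FADV_SEQUENTIAL);

	/* ... use the mapping ... */

	/* unregister: closes the retained fd and drops cached pages */
	MT_mmap_del(base, len);
}
#endif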
static void
MT_mmap_unload_tile(int i, size_t off, stream *err)
{
	/* tell Linux to please stop caching this stuff */
	int ret = posix_madvise(MT_mmap_tab[i].base + off, MT_MMAP_TILE, POSIX_MADV_DONTNEED);

	if (err) {
		stream_printf(err, "MT_mmap_unload_tile: posix_madvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n",
			      MT_mmap_tab[i].path, (unsigned int) (off >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20), MT_mmap_tab[i].fd, ret);
	}
#ifdef HAVE_POSIX_FADVISE
	if (!do_not_use_posix_fadvise) {
		/* tell the OS quite clearly that we want to drop this */
		ret = posix_fadvise(MT_mmap_tab[i].fd, off, MT_MMAP_TILE, POSIX_FADV_DONTNEED);
		if (err) {
			stream_printf(err, "MT_mmap_unload_tile: posix_fadvise(%s,off=%uMB,%uMB,fd=%d,POSIX_FADV_DONTNEED) = %d\n",
				      MT_mmap_tab[i].path, (unsigned int) (off >> 20),
				      (unsigned int) (MT_MMAP_TILE >> 20), MT_mmap_tab[i].fd, ret);
		}
	}
#endif
}

static int
MT_mmap_save_tile(int i, size_t tile, stream *err)
{
	int t, ret;

	/* save a 128MB tile to disk, and observe how long this takes */
	if (err) {
		stream_printf(err, "MT_mmap_save_tile: msync(%s,off=%uM,%uM,SYNC)...\n",
			      MT_mmap_tab[i].path, (unsigned int) (tile >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20));
	}
	t = GDKms();
	ret = MT_msync(MT_mmap_tab[i].base + tile, MT_MMAP_TILE, MMAP_SYNC);
	t = GDKms() - t;
	if (err) {
		stream_printf(err, "MT_mmap_save_tile: msync(%s,tile=%uM,%uM,SYNC) = %d (%dms)\n",
			      MT_mmap_tab[i].path, (unsigned int) (tile >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20), ret, t);
	}
	if (t > 200) {
		/* this took time, so we should report back on our actions and await new orders */
		if (MT_mmap_tab[i].save_tile == 1) {
			MT_mmap_tab[i].first_tile = tile;
			/* leave the first tile for a later sequential use pass (start unloading after it) */
			MT_mmap_tab[i].unload_tile = tile + MT_MMAP_TILE;
		}
		MT_mmap_tab[i].save_tile = tile + MT_MMAP_TILE;
		/* note: this releases the lock taken by our caller MT_mmap_trim */
		(void) pthread_mutex_unlock(&MT_mmap_lock);
		return 1;
	}
	return 0;
}

/* round-robin next; this ensures some fairness if multiple large results are produced simultaneously */
static int
MT_mmap_next(int i)
{
	if (i != -1) {
		i = MT_mmap_tab[i].next;
		if (i == -1)
			i = MT_mmap_first;
	}
	return i;
}

int
MT_mmap_trim(size_t target, void *fp)
{
	stream *err = (stream *) fp;
	size_t off, rss = MT_getrss();
	int i, worry = (rss * 4 > target * 3);	/* worry when RSS exceeds 3/4 of the target */

	(void) pthread_mutex_lock(&MT_mmap_lock);
	if (err) {
		stream_printf(err, "MT_mmap_trim(%u MB): rss = %u MB\n",
			      (unsigned int) (target >> 20), (unsigned int) (rss >> 20));
	}
	if (rss > target) {
		/* try to selectively unload pages from the writable regions */
		size_t delta = ((rss - target) + 4 * MT_MMAP_TILE - 1) & ~(MT_MMAP_TILE - 1);

		for (i = MT_mmap_next(MT_mmap_cur); delta && i != MT_mmap_cur; i = MT_mmap_next(i)) {
			if (MT_mmap_tab[i].fd >= 0 && MT_mmap_tab[i].writable) {
				if (MT_mmap_tab[i].unload_tile >= MT_mmap_tab[i].save_tile)
					MT_mmap_tab[i].unload_tile = MT_mmap_tab[i].first_tile;
				while (MT_mmap_tab[i].unload_tile < MT_mmap_tab[i].save_tile) {
					MT_mmap_unload_tile(i, MT_mmap_tab[i].unload_tile, err);
					MT_mmap_tab[i].unload_tile += MT_MMAP_TILE;
					if ((delta -= MT_MMAP_TILE) == 0)
						break;
				}
			}
		}
	}
	if (worry) {
		/* schedule background saves of tiles */
		for (i = MT_mmap_next(MT_mmap_cur); i != MT_mmap_cur; i = MT_mmap_next(i)) {
			if (MT_mmap_tab[i].fd >= 0 && MT_mmap_tab[i].writable &&
			    (MT_mmap_tab[i].pincnt == 0 || MT_mmap_tab[i].len > target)) {
				if (MT_mmap_tab[i].save_tile == 1) {
					/* first run: walk backwards until we hit an unsaved tile */
					for (off = MT_mmap_tab[i].len; off >= MT_MMAP_TILE; off -= MT_MMAP_TILE)
						if (MT_mmap_save_tile(i, off, err))
							return 1;	/* lock released by MT_mmap_save_tile */
				} else {
					/* save the next tile */
					for (off = MT_mmap_tab[i].save_tile; off + MT_MMAP_TILE < MT_mmap_tab[i].len; off += MT_MMAP_TILE) {
						if (MT_mmap_save_tile(i, off, err))
							return 1;	/* lock released by MT_mmap_save_tile */
					}
					/* we seem to have run through all savable tiles */
					if (!MT_mmap_tab[i].last_tile) {
						MT_mmap_tab[i].last_tile = 1;
						MT_mmap_tab[i].unload_tile = MT_mmap_tab[i].first_tile;
					}
				}
			}
		}
		MT_mmap_cur = i;
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
	return worry;
}
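/* Illustration (not part of the original file): a minimal sketch of the
 * VMtrim thread described in the big comment at the top. The loop shape,
 * the one-second poll interval, and the MT_sleep_ms() helper are
 * assumptions (the real thread is not in this excerpt); mem_maxsize() is
 * the RSS target the comment refers to. MT_mmap_trim() returns non-zero
 * while memory pressure is still in the "worry" zone.
 */
#if 0	/* example only, not compiled */
static void *
example_vmtrim_thread(void *arg)
{
	(void) arg;
	for (;;) {
		/* one pass: unload tiles if RSS > target, save at most one
		 * expensive (>200ms) tile if RSS > 3/4 of the target */
		(void) MT_mmap_trim(mem_maxsize(), NULL);
		MT_sleep_ms(1000);	/* hypothetical sleep helper */
	}
	return NULL;
}
#endif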
void
MT_mmap_pin(void *base, size_t len)
{
	int i;

	(void) pthread_mutex_lock(&MT_mmap_lock);
	i = MT_mmap_idx(base, len);
	if (i >= 0) {
		MT_mmap_tab[i].pincnt++;
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
}
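/* Illustration (not part of the original file): the unload budget computed
 * in MT_mmap_trim() rounds the RSS excess to whole 128MB tiles and adds
 * roughly three tiles of headroom. A worked example with hypothetical
 * numbers: for rss - target = 300MB,
 *   delta = (300MB + 4*128MB - 1) & ~(128MB - 1)
 *         = (812MB - 1) rounded down to a tile multiple
 *         = 768MB, i.e. 6 tiles to unload.
 */
#if 0	/* example only, not compiled */
#include <assert.h>

static void
example_delta(void)
{
	size_t TILE = (size_t) 1 << 27;			/* 128MB tile */
	size_t excess = 300 * ((size_t) 1 << 20);	/* rss - target = 300MB */
	size_t delta = (excess + 4 * TILE - 1) & ~(TILE - 1);

	assert(delta == 6 * TILE);	/* 768MB */
}
#endif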