⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gdk_posix.mx

📁 这个是内存数据库中的一个管理工具
💻 MX
📖 第 1 页 / 共 5 页
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_posix@a Niels Nes, Peter Boncz@* System Independent LayerGDK is built on Posix. Exceptions are made for memory mapped files and anonymous virtual memory, for which somewhat higher-level functions are defined here.Most of this file concerns itself with emulation of Posix functionality on the WIN32 native platform.@-@{@h#ifndef GDK_POSIX_H#define GDK_POSIX_H#include "gdk_system.h"#include <sys/types.h>#ifdef HAVE_MALLOC_H# include <malloc.h>		/* mallopt, mallinfo, and  malloc, free etc. */#endif#ifdef HAVE_FTIME#include <sys/timeb.h>#endif#if TIME_WITH_SYS_TIME# include <sys/time.h># include <time.h>#else# if HAVE_SYS_TIME_H#  include <sys/time.h># else#  include <time.h># endif#endif#if defined(HAVE_WINSOCK_H) && defined(NATIVE_WIN32)#include <winsock.h>		/* for timeval */#endif#ifdef NATIVE_WIN32#include <io.h>#include <direct.h>#else#ifdef PROFILE/* Linux gprof messes up on multithreaded programs */gdk_export int gprof_pthread_create(pthread_t *__restrict, __const pthread_attr_t *__restrict,                                     void * (*fcn)(void *), void *__restrict);#endif#endif/* Some systems (SGI, Sun) call malloc before we get a chance to call   mallopt, and mallopt should be called before the first call to   malloc.  
Therefore we do as if we don't have mallopt, even though   in reality we do. */#ifdef HAVE_MALLOPT#undef HAVE_MALLOPT#endif#ifndef HAVE_MALLINFO#ifndef M_MXFAST#define M_MXFAST	1	/* set size of blocks to be fast */#endif#ifndef M_NLBLKS#define M_NLBLKS	2	/* set number of block in a holding block */#endif#ifndef M_GRAIN#define M_GRAIN		3	/* set number of sizes mapped to one, for */			   /* small blocks */#endif#ifndef M_KEEP#define M_KEEP		4	/* retain contents of block after a free */			   /* until another allocation */#endif#ifndef HAVE_STRUCT_MALLINFOstruct mallinfo {	int arena;		/* total space in arena */	int ordblks;		/* number of ordinary blocks */	int smblks;		/* number of small blocks */	int hblks;		/* number of holding blocks */	int hblkhd;		/* space in holding block headers */	int usmblks;		/* space in small blocks in use */	int fsmblks;		/* space in free small blocks */	int uordblks;		/* space in ordinary blocks in use */	int fordblks;		/* space in free ordinary blocks */	int keepcost;		/* cost of enabling keep option */};#endif#define mallinfo() 		{0}#define mallopt(cmd,value)	0#endif /* ! 
HAVE_MALLINFO */gdk_export struct mallinfo MT_mallinfo(void);@- locking, sleep@h#define F_TLOCK 2		/* test and lock a region for exclusive use */#define F_ULOCK 0		/* unlock a previously locked region */#define F_LOCK 1		/* lock a region for exclusive use */gdk_export int MT_lockf(char *filename, int mode, off_t off, off_t len);gdk_export void MT_sleep_ms(unsigned int ms);@- virtual memory@h#define MT_VMUNITLOG 	16#define MT_VMUNITSIZE 	(1 << MT_VMUNITLOG)#ifdef DEBUG_ALLOCgdk_export int MT_alloc_register(void *p, size_t size, char mode);gdk_export int MT_alloc_print(void);gdk_export int MT_alloc_table(void);#else#define MT_alloc_register(p, size, mode) (void)p; (void)size; (void)mode#define MT_alloc_print()#define MT_alloc_table()#endif /* make sure POSIX_MADV_* and posix_madvise() are defined somehow */#ifdef HAVE_SYS_MMAN_H#  define __USE_BSD# include <sys/mman.h>#endif#ifndef HAVE_POSIX_MADVISE# ifdef HAVE_MADVISE#  define posix_madvise madvise#  ifndef MADV_RANDOM#   define MADV_RANDOM	0#  endif#  ifndef POSIX_MADV_NORMAL#   define POSIX_MADV_NORMAL     MADV_NORMAL#   define POSIX_MADV_RANDOM     MADV_RANDOM#   define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL#   define POSIX_MADV_WILLNEED   MADV_WILLNEED#   define POSIX_MADV_DONTNEED   MADV_DONTNEED#  endif# else#  define posix_madvise(x,y,z)	0#  ifndef POSIX_MADV_NORMAL#   define POSIX_MADV_NORMAL     0#   define POSIX_MADV_RANDOM     0#   define POSIX_MADV_SEQUENTIAL 0#   define POSIX_MADV_WILLNEED   0#   define POSIX_MADV_DONTNEED   0#  endif# endif#endif/* in case they are still not defined, define these values as   something that doesn't do anything */#ifndef POSIX_MADV_NORMAL#define POSIX_MADV_NORMAL 0#endif#ifndef POSIX_MADV_RANDOM#define POSIX_MADV_RANDOM 0#endif#ifndef POSIX_MADV_SEQUENTIAL#define POSIX_MADV_SEQUENTIAL 0#endif#ifndef POSIX_MADV_WILLNEED#define POSIX_MADV_WILLNEED 0#endif#ifndef POSIX_MADV_DONTNEED#define POSIX_MADV_DONTNEED 0#endif/* the new mmap modes, mimic default MADV_* madvise POSIX 
constants */#define MMAP_NORMAL     	POSIX_MADV_NORMAL	/* no further special treatment */#define MMAP_RANDOM     	POSIX_MADV_RANDOM	/* expect random page references */#define MMAP_SEQUENTIAL 	POSIX_MADV_SEQUENTIAL	/* expect sequential page references */#define MMAP_WILLNEED   	POSIX_MADV_WILLNEED	/* will need these pages */#define MMAP_DONTNEED   	POSIX_MADV_DONTNEED	/* don't need these pages */#define MMAP_READ		1024	/* region is readable (default if ommitted) */#define MMAP_WRITE		2048	/* region may be written into */#define MMAP_COPY		4096	/* writable, but changes never reach file */#define MMAP_ASYNC		8192	/* asynchronous writes (default if ommitted) */#define MMAP_SYNC		16384	/* writing is done synchronously *//* in order to be sure of madvise and msync modes, pass them to mmap() call as well *//* a hook function to add any initialization required for the MT_ functionality */gdk_export char *MT_heapbase;gdk_export char *MT_heapcur(void);gdk_export void MT_init_posix(int alloc_map);gdk_export size_t MT_getrss(void);gdk_export void *MT_mmap(char *path, int mode, off_t off, size_t len);gdk_export int MT_munmap(void *p, size_t len);gdk_export int MT_msync(void *p, size_t len, int mode);gdk_export int MT_madvise(void *p, size_t len, int advise);typedef struct MT_mmap_hdl_t {	void *hdl;	int mode;	void *fixed;#ifdef NATIVE_WIN32	int hasLock;	void *map;#endif} MT_mmap_hdl;gdk_export void *MT_mmap_open(MT_mmap_hdl *hdl, char *path, int mode, off_t off, size_t len, size_t nremaps);gdk_export void *MT_mmap_remap(MT_mmap_hdl *hdl, off_t off, size_t len);gdk_export void MT_mmap_close(MT_mmap_hdl *hdl);gdk_export int MT_mmap_trim(size_t lim, void *err);gdk_export void MT_mmap_pin(void *p, size_t len);gdk_export void MT_mmap_unpin(void *p, size_t len);gdk_export void *MT_vmalloc(size_t size, size_t * maxsize);gdk_export void MT_vmfree(void *p, size_t size);gdk_export void *MT_vmrealloc(void *voidptr, size_t oldsize, size_t newsize, size_t oldmaxsize, size_t * 
newmaxsize);gdk_export int MT_path_absolute(char *path);@}@+ Posix under WIN32 WIN32 actually supports many Posix functions directly.  Some it does not, though. For some functionality we move in Monet from Posix calls to MT_*() calls, which translate easier to WIN32.  Examples are MT_mmap() , MT_sleep_ms() and MT_path_absolute(). Why? In the caseof mmap() it is much easier for WIN32 to get a filename parameter rather than a file-descriptor. That is the reason in the case of mmap() to go for a MT_mmap() solution.For some other functionality, we do not need to abandon the Posix interface, though. Two cases can be distinguished.Missing functions in WIN32 are directly implemented (e.g. dlopen()/dlsym()/dlclose()).Posix functions in WIN32 whose functionality should be changed a bit. Examples are stat()/rename()/mkdir()/rmdir() who under WIN32 do not work if the path ends with a directory separator, but should work according to Posix. We remap such functions using a defineto an equivalent win_*() function (which in its implementation calls through to the WIN32 function).@{@h#ifdef NATIVE_WIN32#define RTLD_LAZY	1#define RTLD_NOW	2#define RTLD_GLOBAL	4gdk_export void *dlopen(const char *file, int mode);gdk_export int dlclose(void *handle);gdk_export void *dlsym(void *handle, const char *name);gdk_export char *dlerror(void);#ifndef HAVE_GETTIMEOFDAYgdk_export int gettimeofday(struct timeval *tv, int *ignore_zone);#endifgdk_export int win_stat(const char *, struct stat *);gdk_export int win_rmdir(const char *);gdk_export int win_mkdir(const char *, const int mode);#define stat(x,y)	win_stat(x,y)#define mkdir		win_mkdir#define rmdir		win_rmdir#if _WIN32_WINNT >= 0x500#define link		win_link#endif#define NAME_MAX 255#ifndef HAVE_OPENDIRstruct DIR {	char *dir_name;	int just_opened;	unsigned int find_file_handle;	char *find_file_data;};typedef struct DIR DIR;struct direct {	char d_name[NAME_MAX + 1];	int d_namelen;};#endif#ifndef HAVE_FTRUNCATEgdk_export int ftruncate(int fd, 
off_t size);#endif#ifndef HAVE_OPENDIRgdk_export DIR *opendir(const char *dirname);gdk_export struct direct *readdir(DIR *dir);gdk_export void rewinddir(DIR *dir);gdk_export int closedir(DIR *dir);#endif#endif#endif /* GDK_POSIX_H */@c#include "monetdb_config.h"#include "gdk.h"#include <stdio.h>#ifdef HAVE_FCNTL_H# include <fcntl.h>#endif#ifdef WIN32size_t  GDK_mem_pagebits = 16; /* on windows, the mmap addresses can be set by the 64KB */#elsesize_t  GDK_mem_pagebits = 14; /* on linux, 4KB pages can be addressed */#endif#ifndef MAP_NORESERVE# define MAP_NORESERVE 		MAP_PRIVATE#endif#define MMAP_ADVISE		7#define MMAP_WRITABLE		(MMAP_WRITE|MMAP_COPY)/* DDALERT: AIX4.X 64bits needs HAVE_SETENV==0 due to a AIX bug, but it probably isn't detected so by configure */#ifndef HAVE_SETENVintsetenv(const char *name, const char *value, int overwrite){	int ret = 0;	if (overwrite || getenv(name) == NULL) {		char *p = (char *) GDKmalloc(2 + strlen(name) + strlen(value));		strcpy(p, name);		strcat(p, "=");		strcat(p, value);		ret = putenv(p);		/* GDKfree(p); LEAK INSERTED DUE TO SOME WEIRD CRASHES */	}	return ret;}#endifchar *MT_heapbase = NULL;/* Crude VM buffer management that keep a list of all memory mapped regions. * * a.k.a. "helping stupid VM implementations that ignore VM advise" * * The main goal is to be able to tell the OS to please stop buffering all memory * mapped pages when under pressure. A major problem is materialization of large * results in newly created memory mapped files. Operating systems tend to cache * all dirty pages, such that when memory is out, all pages are dirty and cannot * be unloaded quickly. The VM panic occurs and comatose OS states may be observed. * This is in spite of our use of madvise(MADV_SEQUENTIAL). That is; we would want  * that the OS drops pages after we've passed them. That does not happen; pages are  * retained and pollute the buffer cache. 
*
 * Regrettably, at this level, we don't know anything about how Monet is using the
 * mmapped regions. Monet code is totally oblivious of any I/O; that's why it is
 * so easy to create CPU efficient code in Monet.
 *
 * The current solution focuses on large writable maps. These often represent
 * newly created BATs, that are the result of some (running) operator. We
 * assume two things here:
 * - the BAT is created in sequential fashion (almost always true)
 * - afterwards, this BAT is used in sequential fashion (often true)
 *
 * A VMtrim thread keeps an eye on the RSS (memory pressure) and large writable
 * memory maps. If RSS approaches mem_maxsize(), it starts to *worry*, and starts
 * to write dirty data from these writable maps to disk in 128MB tiles. So, if
 * memory pressure rises further in the near future, the OS has some option to release
 * memory pages cheaply (i.e. without needing I/O). This is also done explicitly by the
 * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS to release pages.
 * The reason is that Linux is not smart enough to do even this. Anyway..
 *
 * The way to free pages explicitly in Linux is to call posix_fadvise(..,POSIX_FADV_DONTNEED).
 * Particularly, posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and documented
 * doesn't work on Linux. But we do both posix_madvise and posix_fadvise, so on other unix
 * systems that don't support posix_fadvise, posix_madvise still might work.
 * On Windows, to our knowledge, there is no way to tell it to stop buffering
 * a memory mapped region. msync (FlushViewOfFile) does work, though. So let's
 * hope the VM paging algorithm behaves better than Linux which just runs off
 * the cliff and if MonetDB does not prevent RSS from being too high, enters coma.
 *
 * We will only be able to sensibly test this on Windows64.
On Windows32, mmap sizes
 * do not significantly exceed RAM sizes so MonetDB swapping actually will not happen
 * (of course, you've got this nasty problem of VM fragmentation and failing mmaps instead).
 *
 * In principle, page tiles are saved sequentially, and behind it, but never overtaking
 * it, is an "unload-cursor" that frees the pages if that is needed to keep RSS down.
 * There is a tweak in the algorithm, that resets the unload-cursor if it seems
 * that all tiles to the end have been saved (whether a tile is actually saved is
 * determined by timing the sync action). This means that the producing operator
 * has finished creating the BAT, and we assume it is going to be used sequentially afterwards.
 * In that case, we should start unloading right after the 'read-cursor', that is,
 * from the start.
 *
 * EXAMPLE
 * D = dirty tile
 * s = saved tile (i.e. clean)
 * u = unloaded tile
 * L = tile that is being loaded
 *
 *           +--> operator produces  BAT
 * (1) DDDDDD|......................................| end of reserved mmap
 *                      ____|RSS
 *                     |
 *                     | at 3/4 of RSS consumed we start to worry
 *                     +--> operator produces BAT
 * (2) DDDDDDDDDDDDDDDD|............................|
 *                    s<----------------------------- VM backwards save thread
 *                    |
 *                    + first tile of which saving costs anything
 *
 *                        +--> operator produces BAT
 * (3) DDDDDDDDDDDDDDDss|D|.........................|
 *     VM-thread save ->|
 *
 * When the RSS target is exceeded, we start unloading tiles..
 *
 *                     +-->  VM-thread unload starts at *second* 's'
 *                     |
 *                     |    +--> operator produces BAT
 * (4) DDDDDDDDDDDDDDDsus|DD|........................|
 *     VM-thread save -->|  | RSS = Full!
 *
 *                                  +-- 0 => save costs nothing!!
*     VM-thread save ------------->|        assume bat complete
 * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *                    |<-------- re-set unload cursor
 *                    +--- first tile was not unloaded.
 *
 * later.. some other operator sequentially reads the bat
 * first part is 'D', that is, nicely cached.
 *
 *     ---read------->|
 * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *
 * now we're hitting the unloaded region. the query becomes
 * I/O read bound here (typically 20% CPU utilization).
 *
 *     ---read-------->|
 * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
 *                   /  \
 *      unload cursor    load cursor
 *
 *     ---read---------------->|
 * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                           /  \
 *              unload cursor    load cursor
 *
 *     ---read--------------------->| done
 * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                              ****
 *                              last part still cached
 *
 * note: if we would not have reset the unload cursor (5)
 *       the last part would have been lost due to continuing
 *       RSS pressure from the 'L' read-cursor.
 *
 * If multiple write-mmaps exist, we do unload-tile and save-tile
 * selection on a round-robin basis among them.
 *
 * Of course, this is a simple solution for simple cases only.
 * (a) if the bat is produced too fast, (or your disk is too slow)
 *     RSS will exceed its limit and Linux will go into swapping.
 * (b) if your data is not produced and read sequentially.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -