📄 gdk_posix.mx
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_posix@a Niels Nes, Peter Boncz@* System Independent LayerGDK is built on Posix. Exceptions are made for memory mapped files and anonymous virtual memory, for which somewhat higher-level functions are defined here.Most of this file concerns itself with emulation of Posix functionality on the WIN32 native platform.@-@{@h#ifndef GDK_POSIX_H#define GDK_POSIX_H#include "gdk_system.h"#include <sys/types.h>#ifdef HAVE_MALLOC_H# include <malloc.h> /* mallopt, mallinfo, and malloc, free etc. */#endif#ifdef HAVE_FTIME#include <sys/timeb.h>#endif#if TIME_WITH_SYS_TIME# include <sys/time.h># include <time.h>#else# if HAVE_SYS_TIME_H# include <sys/time.h># else# include <time.h># endif#endif#if defined(HAVE_WINSOCK_H) && defined(NATIVE_WIN32)#include <winsock.h> /* for timeval */#endif#ifdef NATIVE_WIN32#include <io.h>#include <direct.h>#else#ifdef PROFILE/* Linux gprof messes up on multithreaded programs */gdk_export int gprof_pthread_create(pthread_t *__restrict, __const pthread_attr_t *__restrict, void * (*fcn)(void *), void *__restrict);#endif#endif/* Some systems (SGI, Sun) call malloc before we get a chance to call mallopt, and mallopt should be called before the first call to malloc. Therefore we do as if we don't have mallopt, even though in reality we do. 
*/#ifdef HAVE_MALLOPT#undef HAVE_MALLOPT#endif#ifndef HAVE_MALLINFO#ifndef M_MXFAST#define M_MXFAST 1 /* set size of blocks to be fast */#endif#ifndef M_NLBLKS#define M_NLBLKS 2 /* set number of block in a holding block */#endif#ifndef M_GRAIN#define M_GRAIN 3 /* set number of sizes mapped to one, for */ /* small blocks */#endif#ifndef M_KEEP#define M_KEEP 4 /* retain contents of block after a free */ /* until another allocation */#endif#ifndef HAVE_STRUCT_MALLINFOstruct mallinfo { int arena; /* total space in arena */ int ordblks; /* number of ordinary blocks */ int smblks; /* number of small blocks */ int hblks; /* number of holding blocks */ int hblkhd; /* space in holding block headers */ int usmblks; /* space in small blocks in use */ int fsmblks; /* space in free small blocks */ int uordblks; /* space in ordinary blocks in use */ int fordblks; /* space in free ordinary blocks */ int keepcost; /* cost of enabling keep option */};#endif#define mallinfo() {0}#define mallopt(cmd,value) 0#endif /* ! 
HAVE_MALLINFO */gdk_export struct mallinfo MT_mallinfo(void);@- locking, sleep@h#define F_TLOCK 2 /* test and lock a region for exclusive use */#define F_ULOCK 0 /* unlock a previously locked region */#define F_LOCK 1 /* lock a region for exclusive use */gdk_export int MT_lockf(char *filename, int mode, off_t off, off_t len);gdk_export void MT_sleep_ms(unsigned int ms);@- virtual memory@h#define MT_VMUNITLOG 16#define MT_VMUNITSIZE (1 << MT_VMUNITLOG)#ifdef DEBUG_ALLOCgdk_export int MT_alloc_register(void *p, size_t size, char mode);gdk_export int MT_alloc_print(void);gdk_export int MT_alloc_table(void);#else#define MT_alloc_register(p, size, mode) (void)p; (void)size; (void)mode#define MT_alloc_print()#define MT_alloc_table()#endif /* make sure POSIX_MADV_* and posix_madvise() are defined somehow */#ifdef HAVE_SYS_MMAN_H# define __USE_BSD# include <sys/mman.h>#endif#ifndef HAVE_POSIX_MADVISE# ifdef HAVE_MADVISE# define posix_madvise madvise# ifndef MADV_RANDOM# define MADV_RANDOM 0# endif# ifndef POSIX_MADV_NORMAL# define POSIX_MADV_NORMAL MADV_NORMAL# define POSIX_MADV_RANDOM MADV_RANDOM# define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL# define POSIX_MADV_WILLNEED MADV_WILLNEED# define POSIX_MADV_DONTNEED MADV_DONTNEED# endif# else# define posix_madvise(x,y,z) 0# ifndef POSIX_MADV_NORMAL# define POSIX_MADV_NORMAL 0# define POSIX_MADV_RANDOM 0# define POSIX_MADV_SEQUENTIAL 0# define POSIX_MADV_WILLNEED 0# define POSIX_MADV_DONTNEED 0# endif# endif#endif/* in case they are still not defined, define these values as something that doesn't do anything */#ifndef POSIX_MADV_NORMAL#define POSIX_MADV_NORMAL 0#endif#ifndef POSIX_MADV_RANDOM#define POSIX_MADV_RANDOM 0#endif#ifndef POSIX_MADV_SEQUENTIAL#define POSIX_MADV_SEQUENTIAL 0#endif#ifndef POSIX_MADV_WILLNEED#define POSIX_MADV_WILLNEED 0#endif#ifndef POSIX_MADV_DONTNEED#define POSIX_MADV_DONTNEED 0#endif/* the new mmap modes, mimic default MADV_* madvise POSIX constants */#define MMAP_NORMAL POSIX_MADV_NORMAL /* no 
further special treatment */#define MMAP_RANDOM POSIX_MADV_RANDOM /* expect random page references */#define MMAP_SEQUENTIAL POSIX_MADV_SEQUENTIAL /* expect sequential page references */#define MMAP_WILLNEED POSIX_MADV_WILLNEED /* will need these pages */#define MMAP_DONTNEED POSIX_MADV_DONTNEED /* don't need these pages */#define MMAP_READ 1024 /* region is readable (default if omitted) */#define MMAP_WRITE 2048 /* region may be written into */#define MMAP_COPY 4096 /* writable, but changes never reach file */#define MMAP_ASYNC 8192 /* asynchronous writes (default if omitted) */#define MMAP_SYNC 16384 /* writing is done synchronously *//* in order to be sure of madvise and msync modes, pass them to mmap() call as well *//* a hook function to add any initialization required for the MT_ functionality */gdk_export char *MT_heapbase;gdk_export char *MT_heapcur(void);gdk_export void MT_init_posix(int alloc_map);gdk_export size_t MT_getrss(void);gdk_export void *MT_mmap(char *path, int mode, off_t off, size_t len);gdk_export int MT_munmap(void *p, size_t len);gdk_export int MT_msync(void *p, size_t len, int mode);gdk_export int MT_madvise(void *p, size_t len, int advise);typedef struct MT_mmap_hdl_t { void *hdl; int mode; void *fixed;#ifdef NATIVE_WIN32 int hasLock; void *map;#endif} MT_mmap_hdl;gdk_export void *MT_mmap_open(MT_mmap_hdl *hdl, char *path, int mode, off_t off, size_t len, size_t nremaps);gdk_export void *MT_mmap_remap(MT_mmap_hdl *hdl, off_t off, size_t len);gdk_export void MT_mmap_close(MT_mmap_hdl *hdl);gdk_export int MT_mmap_trim(size_t lim, void *err);gdk_export void MT_mmap_pin(void *p, size_t len);gdk_export void MT_mmap_unpin(void *p, size_t len);gdk_export void *MT_vmalloc(size_t size, size_t * maxsize);gdk_export void MT_vmfree(void *p, size_t size);gdk_export void *MT_vmrealloc(void *voidptr, size_t oldsize, size_t newsize, size_t oldmaxsize, size_t * newmaxsize);gdk_export int MT_path_absolute(char *path);@}@+ Posix under WIN32 WIN32 actually 
supports many Posix functions directly. Some it does not, though. For some functionality we move in Monet from Posix calls to MT_*() calls, which translate more easily to WIN32. Examples are MT_mmap(), MT_sleep_ms() and MT_path_absolute(). Why? In the case of mmap() it is much easier for WIN32 to get a filename parameter rather than a file-descriptor. That is the reason in the case of mmap() to go for a MT_mmap() solution. For some other functionality, we do not need to abandon the Posix interface, though. Two cases can be distinguished. Missing functions in WIN32 are directly implemented (e.g. dlopen()/dlsym()/dlclose()). Posix functions in WIN32 whose functionality should be changed a bit. Examples are stat()/rename()/mkdir()/rmdir() which under WIN32 do not work if the path ends with a directory separator, but should work according to Posix. We remap such functions using a define to an equivalent win_*() function (which in its implementation calls through to the WIN32 function).@{@h#ifdef NATIVE_WIN32#define RTLD_LAZY 1#define RTLD_NOW 2#define RTLD_GLOBAL 4gdk_export void *dlopen(const char *file, int mode);gdk_export int dlclose(void *handle);gdk_export void *dlsym(void *handle, const char *name);gdk_export char *dlerror(void);#ifndef HAVE_GETTIMEOFDAYgdk_export int gettimeofday(struct timeval *tv, int *ignore_zone);#endifgdk_export int win_stat(const char *, struct stat *);gdk_export int win_rmdir(const char *);gdk_export int win_mkdir(const char *, const int mode);#define stat(x,y) win_stat(x,y)#define mkdir win_mkdir#define rmdir win_rmdir#if _WIN32_WINNT >= 0x500#define link win_link#endif#define NAME_MAX 255#ifndef HAVE_OPENDIRstruct DIR { char *dir_name; int just_opened; unsigned int find_file_handle; char *find_file_data;};typedef struct DIR DIR;struct direct { char d_name[NAME_MAX + 1]; int d_namelen;};#endif#ifndef HAVE_FTRUNCATEgdk_export int ftruncate(int fd, off_t size);#endif#ifndef HAVE_OPENDIRgdk_export DIR *opendir(const char *dirname);gdk_export struct 
direct *readdir(DIR *dir);gdk_export void rewinddir(DIR *dir);gdk_export int closedir(DIR *dir);#endif#endif#endif /* GDK_POSIX_H */@c#include "monetdb_config.h"#include "gdk.h"#include <stdio.h>#ifdef HAVE_FCNTL_H# include <fcntl.h>#endif#ifdef WIN32size_t GDK_mem_pagebits = 16; /* on windows, the mmap addresses can be set by the 64KB */#elsesize_t GDK_mem_pagebits = 14; /* on linux, 4KB pages can be addressed */#endif#ifndef MAP_NORESERVE# define MAP_NORESERVE MAP_PRIVATE#endif#define MMAP_ADVISE 7#define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)/* DDALERT: AIX4.X 64bits needs HAVE_SETENV==0 due to a AIX bug, but it probably isn't detected so by configure */#ifndef HAVE_SETENVintsetenv(const char *name, const char *value, int overwrite){ int ret = 0; if (overwrite || getenv(name) == NULL) { char *p = (char *) GDKmalloc(2 + strlen(name) + strlen(value)); strcpy(p, name); strcat(p, "="); strcat(p, value); ret = putenv(p); /* GDKfree(p); LEAK INSERTED DUE TO SOME WEIRD CRASHES */ } return ret;}#endifchar *MT_heapbase = NULL;/* Crude VM buffer management that keep a list of all memory mapped regions. * * a.k.a. "helping stupid VM implementations that ignore VM advise" * * The main goal is to be able to tell the OS to please stop buffering all memory * mapped pages when under pressure. A major problem is materialization of large * results in newly created memory mapped files. Operating systems tend to cache * all dirty pages, such that when memory is out, all pages are dirty and cannot * be unloaded quickly. The VM panic occurs and comatose OS states may be observed. * This is in spite of our use of madvise(MADV_SEQUENTIAL). That is; we would want * that the OS drops pages after we've passed them. That does not happen; pages are * retained and pollute the buffer cache. * * Regrettably, at this level, we don't know anything about how Monet is using the * mmapped regions. 
Monet code is totally oblivious of any I/O; that's why it is * so easy to create CPU efficient code in Monet. * * The current solution focuses on large writable maps. These often represent * newly created BATs, that are the result of some (running) operator. We * assume two things here: * - the BAT is created in sequential fashion (almost always true) * - afterwards, this BAT is used in sequential fashion (often true) * * A VMtrim thread keeps an eye on the RSS (memory pressure) and large writable * memory maps. If RSS approaches mem_maxsize(), it starts to *worry*, and starts * to write dirty data from these writable maps to disk in 128MB tiles. So, if * memory pressure rises further in the near future, the OS has some options to release * memory pages cheaply (i.e. without needing I/O). This is also done explicitly by the * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS to release pages. * The reason is that Linux is not smart enough to do even this. Anyway.. * * The way to free pages explicitly in Linux is to call posix_fadvise(..,POSIX_FADV_DONTNEED). * Particularly, posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and documented * doesn't work on Linux. But we do both posix_madvise and posix_fadvise, so on other unix * systems that don't support posix_fadvise, posix_madvise still might work. * On Windows, to our knowledge, there is no way to tell it to stop buffering * a memory mapped region. msync (FlushViewOfFile) does work, though. So let's * hope the VM paging algorithm behaves better than Linux which just runs off * the cliff and if MonetDB does not prevent RSS from being too high, enters coma. * * We will only be able to sensibly test this on Windows64. On Windows32, mmap sizes * do not significantly exceed RAM sizes so MonetDB swapping actually will not happen * (of course, you've got this nasty problem of VM fragmentation and failing mmaps instead). 
* * In principle, page tiles are saved sequentially, and behind it, but never overtaking * it, is an "unload-cursor" that frees the pages if that is needed to keep RSS down. * There is a tweak in the algorithm, that re-sets the unload-cursor if it seems * that all tiles to the end have been saved (whether a tile is actually saved is * determined by timing the sync action). This means that the producing operator * is ready creating the BAT, and we assume it is going to be used sequentially afterwards. * In that case, we should start unloading right after the 'read-cursor', that is, * from the start. * * EXAMPLE * D = dirty tile * s = saved tile (i.e. clean) * u = unloaded tile * L = tile that is being loaded * * +--> operator produces BAT * (1) DDDDDD|......................................| end of reserved mmap * ____|RSS * | * | at 3/4 of RSS consumed we start to worry * +--> operator produces BAT * (2) DDDDDDDDDDDDDDDD|............................| * s<----------------------------- VM backwards save thread * | * + first tile of which saving costs anything * * +--> operator produces BAT * (3) DDDDDDDDDDDDDDDss|D|.........................| * VM-thread save ->| * * When the RSS target is exceeded, we start unloading tiles.. * * +--> VM-thread unload starts at *second* 's' * | * | +--> operator produces BAT * (4) DDDDDDDDDDDDDDDsus|DD|........................| * VM-thread save -->| | RSS = Full! * * +-- 0 => save costs nothing!! * VM-thread save ------------->| assume bat complete * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................| * |<-------- re-set unload cursor * +--- first tile was not unloaded. * * later.. some other operator sequentially reads the bat * first part is 'D', that is, nicely cached. * * ---read------->| * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................| * * now we're hitting the unloaded region. the query becomes * I/O read bound here (typically 20% CPU utilization). 
* * ---read-------->| * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................| * / \ * unload cursor load cursor * * ---read---------------->| * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................| * / \ * unload cursor load cursor * * ---read--------------------->| done * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................| * **** * last part still cached * * note: if we had not reset the unload cursor (5) * the last part would have been lost due to continuing * RSS pressure from the 'L' read-cursor. * * If multiple write-mmaps exist, we do unload-tile and save-tile * selection on a round-robin basis among them. * * Of course, this is a simple solution for simple cases only. * (a) if the bat is produced too fast, (or your disk is too slow) * RSS will exceed its limit and Linux will go into swapping. * (b) if your data is not produced and read sequentially.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -