/* gdk_posix.c */
#line 310 "/export/scratch0/monet/monet.GNU.64.64.d.14791/MonetDB/src/gdk/gdk_posix.mx"
#include "monetdb_config.h"
#include "gdk.h"
#include <stdio.h>
#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif

#ifdef WIN32
size_t GDK_mem_pagebits = 16;	/* on Windows, mmap addresses can be set at 64KB granularity */
#else
size_t GDK_mem_pagebits = 14;	/* on Linux, 4KB pages can be addressed */
#endif

#ifndef MAP_NORESERVE
# define MAP_NORESERVE MAP_PRIVATE
#endif

#define MMAP_ADVISE 7
#define MMAP_WRITABLE (MMAP_WRITE|MMAP_COPY)

/* DDALERT: AIX4.X 64bits needs HAVE_SETENV==0 due to an AIX bug, but configure probably does not detect it as such */
#ifndef HAVE_SETENV
int
setenv(const char *name, const char *value, int overwrite)
{
	int ret = 0;

	if (overwrite || getenv(name) == NULL) {
		char *p = (char *) GDKmalloc(2 + strlen(name) + strlen(value));

		strcpy(p, name);
		strcat(p, "=");
		strcat(p, value);
		ret = putenv(p);
		/* GDKfree(p); LEAK INSERTED DUE TO SOME WEIRD CRASHES */
	}
	return ret;
}
#endif

char *MT_heapbase = NULL;

/* Crude VM buffer management that keeps a list of all memory-mapped regions.
 *
 * a.k.a. "helping stupid VM implementations that ignore VM advice"
 *
 * The main goal is to be able to tell the OS to please stop buffering all
 * memory-mapped pages when under pressure. A major problem is the
 * materialization of large results in newly created memory-mapped files.
 * Operating systems tend to cache all dirty pages, such that when memory
 * runs out, all pages are dirty and cannot be unloaded quickly. The VM
 * panic occurs and comatose OS states may be observed. This is in spite of
 * our use of madvise(MADV_SEQUENTIAL). That is: we would want the OS to
 * drop pages after we've passed them. That does not happen; pages are
 * retained and pollute the buffer cache.
 *
 * Regrettably, at this level, we don't know anything about how Monet is
 * using the mmapped regions. Monet code is totally oblivious of any I/O;
 * that's why it is so easy to create CPU-efficient code in Monet.
 *
 * The current solution focuses on large writable maps. These often
 * represent newly created BATs that are the result of some (running)
 * operator. We assume two things here:
 * - the BAT is created in sequential fashion (almost always true)
 * - afterwards, this BAT is used in sequential fashion (often true)
 *
 * A VMtrim thread keeps an eye on the RSS (memory pressure) and on large
 * writable memory maps. If RSS approaches mem_maxsize(), it starts to
 * *worry*, and starts to write dirty data from these writable maps to disk
 * in 128MB tiles. So, if memory pressure rises further in the near future,
 * the OS has some options to release memory pages cheaply (i.e. without
 * needing I/O). This is also done explicitly by the VM thread: when RSS
 * exceeds mem_maxsize(), it explicitly asks the OS to release pages. The
 * reason is that Linux is not smart enough to do even this. Anyway..
 *
 * The way to free pages explicitly in Linux is to call
 * posix_fadvise(..,POSIX_FADV_DONTNEED). Notably, the supported and
 * documented posix_madvise(..,POSIX_MADV_DONTNEED) doesn't work on Linux.
 * But we do both posix_madvise and posix_fadvise (a standalone sketch of
 * this pattern follows after MT_mmap_init below), so on other Unix systems
 * that don't support posix_fadvise, posix_madvise still might work.
 * On Windows, to our knowledge, there is no way to tell it to stop
 * buffering a memory-mapped region. msync (FlushViewOfFile) does work,
 * though. So let's hope its VM paging algorithm behaves better than Linux,
 * which just runs off the cliff: if MonetDB does not prevent RSS from
 * getting too high, the machine enters a coma.
 *
 * We will only be able to sensibly test this on Windows64. On Windows32,
 * mmap sizes do not significantly exceed RAM sizes, so MonetDB swapping
 * will not actually happen (of course, you've got this nasty problem of VM
 * fragmentation and failing mmaps instead).
 *
 * In principle, page tiles are saved sequentially, and behind it, but never
 * overtaking it, is an "unload-cursor" that frees the pages if that is
 * needed to keep RSS down. There is a tweak in the algorithm that re-sets
 * the unload-cursor if it seems that all tiles to the end have been saved
 * (whether a tile is actually saved is determined by timing the sync
 * action). This means that the producing operator is done creating the
 * BAT, and we assume it is going to be used sequentially afterwards. In
 * that case, we should start unloading right after the 'read-cursor',
 * that is, from the start.
 *
 * EXAMPLE
 * D = dirty tile
 * s = saved tile (i.e. clean)
 * u = unloaded tile
 * L = tile that is being loaded
 *
 *           +--> operator produces BAT
 * (1) DDDDDD|......................................| end of reserved mmap
 *     ____|RSS
 *         |
 *         | at 3/4 of RSS consumed we start to worry
 *
 *                     +--> operator produces BAT
 * (2) DDDDDDDDDDDDDDDD|............................|
 *                    s<----------------------------- VM backwards save thread
 *                    |
 *                    + first tile of which saving costs anything
 *
 *                       +--> operator produces BAT
 * (3) DDDDDDDDDDDDDDDss|D|.........................|
 *      VM-thread save ->|
 *
 * When the RSS target is exceeded, we start unloading tiles..
 *
 *                    +--> VM-thread unload starts at *second* 's'
 *                    |
 *                    |   +--> operator produces BAT
 * (4) DDDDDDDDDDDDDDDsus|DD|........................|
 *       VM-thread save -->|  RSS = Full!
 *
 *                                  +-- 0 => save costs nothing!!
 *      VM-thread save ------------>|   assume bat complete
 * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *                    |<-------- re-set unload cursor
 *                    +--- first tile was not unloaded.
 *
 * later.. some other operator sequentially reads the bat
 * first part is 'D', that is, nicely cached.
 *
 *     ---read------->|
 * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
 *
 * now we're hitting the unloaded region. the query becomes
 * I/O read bound here (typically 20% CPU utilization).
 *
 *     ---read-------->|
 * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
 *                    /  \
 *        unload cursor   load cursor
 *
 *     ---read---------------->|
 * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                            /  \
 *                unload cursor   load cursor
 *
 *     ---read--------------------->| done
 * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
 *                              ****
 *                              last part still cached
 *
 * note: had we not re-set the unload cursor in (5),
 * the last part would have been lost due to continuing
 * RSS pressure from the 'L' read-cursor.
 *
 * If multiple write-mmaps exist, we do unload-tile and save-tile
 * selection on a round-robin basis among them.
 *
 * Of course, this is a simple solution for simple cases only.
 * (a) if the bat is produced too fast (or your disk is too slow),
 *     RSS will exceed its limit and Linux will go into swapping.
 * (b) if your data is not produced and read sequentially.
 *     Examples are sorting or clustering on huge datasets.
 * (c) if RSS pressure is due to large read-maps, rather than
 *     intermediate results.
 *
 * Two crude suggestions:
 * - If we are under RSS pressure without unloadable tiles and with
 *   savable tiles, we should consider suspending *all* other threads
 *   until we manage to unload a tile.
 * - if there are no savable tiles (or in case of read-only maps),
 *   we could resort to saving and unloading random tiles.
 *
 * To do better, our BAT algorithms should provide even more detailed
 * advice on their access patterns, which may even consist of pointers
 * to the cursors (i.e. pointers to b->batBuns->free or the cursors
 * in radix-cluster), which an enhanced version of this thread might
 * take into account.
 */

#ifdef HAVE_PTHREAD_H
/* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
#undef HAVE_CONFIG_H
#include <sched.h>
#include <pthread.h>
#endif
#ifdef HAVE_SEMAPHORE_H
#include <semaphore.h>
#endif

typedef struct {
	char path[128];		/* mapped file, retained for debugging */
	char *base;		/* base address */
	size_t len;		/* length of map */
	size_t first_tile;	/* from here we started saving tiles */
	size_t save_tile;	/* next tile to save */
	size_t unload_tile;	/* next tile to unload */
	int last_tile;
	int fd;			/* open fd (== -1 for anonymous vm), retained for posix_fadvise */
	int pincnt;		/* incremented while a MIL command uses the heap with a random pattern */
	int writable;
	int next;
} MT_mmap_t;

#ifdef HAVE_POSIX_FADVISE
static int do_not_use_posix_fadvise = 0;
#endif

#define MT_MMAP_TILE (1<<27)	/* 128MB */
#define MT_MMAP_BUFSIZE 100

MT_mmap_t MT_mmap_tab[MT_MMAP_BUFSIZE];
int MT_mmap_cur = -1, MT_mmap_first = -1, MT_mmap_free = 0;
pthread_mutex_t MT_mmap_lock;

static void
MT_mmap_empty(int i)
{
	MT_mmap_tab[i].path[0] = 0;
	MT_mmap_tab[i].base = NULL;
	MT_mmap_tab[i].len = 0;
	MT_mmap_tab[i].writable = 0;
	MT_mmap_tab[i].fd = -1;
	MT_mmap_tab[i].pincnt = -1;
}

static void
MT_mmap_init(void)
{
	int i;

	/* create lock */
	pthread_mutex_init(&MT_mmap_lock, 0);
	for (i = 0; i < MT_MMAP_BUFSIZE; i++) {
		MT_mmap_tab[i].next = i + 1;
		MT_mmap_empty(i);
	}
	MT_mmap_tab[i - 1].next = -1;
}
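/* Illustration (not part of the original file): a minimal standalone sketch,
 * referenced from the big comment above, of the "drop these pages" pattern:
 * issue both posix_madvise() and posix_fadvise() with DONTNEED, so that at
 * least one of the two takes effect on the platform at hand. The function
 * name drop_region and its parameters are hypothetical.
 */
#if 0	/* example only, not compiled */
#include <fcntl.h>
#include <sys/mman.h>

static void
drop_region(int fd, void *base, size_t len)
{
	/* hint the VM system: these mapped pages will not be needed soon
	 * (documented, but ignored by Linux) */
	(void) posix_madvise(base, len, POSIX_MADV_DONTNEED);
	/* hint the buffer cache as well; on Linux this is the call that
	 * actually releases the cached file pages */
	(void) posix_fadvise(fd, 0, (off_t) len, POSIX_FADV_DONTNEED);
}
#endif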
/* returns the previous element (to facilitate deletion) */
static int
MT_mmap_find(void *base)
{
	/* maybe consider a hash table instead of a linked list?? */
	int i, prev = MT_MMAP_BUFSIZE;

	for (i = MT_mmap_first; i >= 0; i = MT_mmap_tab[i].next) {
		if (MT_mmap_tab[i].base == base) {
			return prev;
		}
		prev = i;
	}
	return i;	/* -1: not found */
}

static int
MT_mmap_idx(void *base, size_t len)
{
	if (len > MT_MMAP_TILE) {
		int i = MT_mmap_find(base);

		if (i >= 0) {
			if (i == MT_MMAP_BUFSIZE) {
				return MT_mmap_first;
			} else {
				return MT_mmap_tab[i].next;
			}
		}
	}
	return -1;
}

static int
MT_mmap_new(char *path, void *base, size_t len, int fd, int writable)
{
	(void) pthread_mutex_lock(&MT_mmap_lock);
	if (len > MT_MMAP_TILE && MT_mmap_free >= 0) {
		int i = MT_mmap_free;

		MT_mmap_free = MT_mmap_tab[i].next;
		MT_mmap_tab[i].next = MT_mmap_first;
		MT_mmap_first = i;
		if (MT_mmap_cur == -1)
			MT_mmap_cur = i;
#ifdef MMAP_DEBUG
		stream_printf(GDKerr, "MT_mmap_new: %s fd=%d\n", path, fd);
#endif
		strncpy(MT_mmap_tab[i].path, path, 128);
		MT_mmap_tab[i].base = base;
		MT_mmap_tab[i].len = len;
		MT_mmap_tab[i].save_tile = 1;
		MT_mmap_tab[i].last_tile = 0;
		MT_mmap_tab[i].first_tile = 0;
		MT_mmap_tab[i].unload_tile = 0;
		MT_mmap_tab[i].writable = writable;
		MT_mmap_tab[i].fd = fd;
		MT_mmap_tab[i].pincnt = 0;
		fd = -1;	/* the registry took ownership of fd */
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
	return fd;
}

static void
MT_mmap_del(void *base, size_t len)
{
	if (len > MT_MMAP_TILE) {
		int victim = 0, prev;

		(void) pthread_mutex_lock(&MT_mmap_lock);
		prev = MT_mmap_find(base);
		if (prev >= 0) {
			int ret;

			if (prev == MT_MMAP_BUFSIZE) {
				victim = MT_mmap_first;
				MT_mmap_first = MT_mmap_tab[MT_mmap_first].next;
			} else {
				victim = MT_mmap_tab[prev].next;
				MT_mmap_tab[prev].next = MT_mmap_tab[victim].next;
			}
			if (MT_mmap_cur == victim) {
				MT_mmap_cur = MT_mmap_first;
			}
#ifdef HAVE_POSIX_FADVISE
			if (!do_not_use_posix_fadvise && MT_mmap_tab[victim].fd >= 0) {
				/* tell the OS quite clearly that we want to drop this */
				ret = posix_fadvise(MT_mmap_tab[victim].fd, 0LL, MT_mmap_tab[victim].len, POSIX_FADV_DONTNEED);
#ifdef MMAP_DEBUG
				stream_printf(GDKerr, "MT_mmap_del: posix_fadvise(%s,fd=%d,%uMB,POSIX_FADV_DONTNEED) = %d\n",
					      MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd,
					      (unsigned int) (MT_mmap_tab[victim].len >> 20), ret);
#endif
			}
#endif
			ret = close(MT_mmap_tab[victim].fd);
#ifdef MMAP_DEBUG
			stream_printf(GDKerr, "MT_mmap_del: close(%s fd=%d) = %d\n",
				      MT_mmap_tab[victim].path, MT_mmap_tab[victim].fd, ret);
#endif
			MT_mmap_tab[victim].next = MT_mmap_free;
			MT_mmap_empty(victim);
			MT_mmap_free = victim;
			(void) ret;
		}
		(void) pthread_mutex_unlock(&MT_mmap_lock);
	}
}

static int
MT_fadvise(void *base, size_t len, int advice)
{
	int ret = 0;

#ifdef HAVE_POSIX_FADVISE
	if (!do_not_use_posix_fadvise) {
		int i;

		(void) pthread_mutex_lock(&MT_mmap_lock);
		i = MT_mmap_idx(base, len);
		if (i >= 0) {
			if (MT_mmap_tab[i].fd >= 0) {
				ret = posix_fadvise(MT_mmap_tab[i].fd, 0, len, advice);
#ifdef MMAP_DEBUG
				stream_printf(GDKerr, "MT_fadvise: posix_fadvise(%s,fd=%d,%uMB,%d) = %d\n",
					      MT_mmap_tab[i].path, MT_mmap_tab[i].fd,
					      (unsigned int) (len >> 20), advice, ret);
#endif
			}
		}
		(void) pthread_mutex_unlock(&MT_mmap_lock);
	}
#else
	(void) base;
	(void) len;
	(void) advice;
#endif
	return ret;
}
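/* Illustration (not part of the original file): how the registry calls above
 * are presumably meant to compose over a large map's lifetime. The callers
 * are not in this excerpt, so this is a sketch under assumptions: fd/base/len
 * come from an mmap of a heap file, and the map is larger than one tile.
 * Note that MT_mmap_new() returns -1 when the registry takes ownership of
 * the descriptor.
 */
#if 0	/* example only, not compiled */
void
example_lifetime(char *path, void *base, size_t len, int fd)
{
	/* register the map; on success the registry keeps fd for later
	 * posix_fadvise calls and we must no longer close it ourselves */
	fd = MT_mmap_new(path, base, len, fd, 1 /* writable */);
	if (fd >= 0)
		close(fd);	/* not registered (too small or table full) */

	/* announce a sequential scan, so the OS can read ahead */
	(void) MT_fadvise(base, len, POSIX_FADV_SEQUENTIAL);

	/* ... use the mapping ... */

	/* unregister: closes the retained fd and drops cached pages */
	MT_mmap_del(base, len);
}
#endif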
static void
MT_mmap_unload_tile(int i, size_t off, stream *err)
{
	/* tell Linux to please stop caching this stuff */
	int ret = posix_madvise(MT_mmap_tab[i].base + off, MT_MMAP_TILE, POSIX_MADV_DONTNEED);

	if (err) {
		stream_printf(err, "MT_mmap_unload_tile: posix_madvise(%s,off=%uMB,%uMB,fd=%d,POSIX_MADV_DONTNEED) = %d\n",
			      MT_mmap_tab[i].path, (unsigned int) (off >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20), MT_mmap_tab[i].fd, ret);
	}
#ifdef HAVE_POSIX_FADVISE
	if (!do_not_use_posix_fadvise) {
		/* tell the OS quite clearly that we want to drop this */
		ret = posix_fadvise(MT_mmap_tab[i].fd, off, MT_MMAP_TILE, POSIX_FADV_DONTNEED);
		if (err) {
			stream_printf(err, "MT_mmap_unload_tile: posix_fadvise(%s,off=%uMB,%uMB,fd=%d,POSIX_FADV_DONTNEED) = %d\n",
				      MT_mmap_tab[i].path, (unsigned int) (off >> 20),
				      (unsigned int) (MT_MMAP_TILE >> 20), MT_mmap_tab[i].fd, ret);
		}
	}
#endif
}

static int
MT_mmap_save_tile(int i, size_t tile, stream *err)
{
	int t, ret;

	/* save a 128MB tile to disk, and observe how long this takes */
	if (err) {
		stream_printf(err, "MT_mmap_save_tile: msync(%s,off=%uM,%uM,SYNC)...\n",
			      MT_mmap_tab[i].path, (unsigned int) (tile >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20));
	}
	t = GDKms();
	ret = MT_msync(MT_mmap_tab[i].base + tile, MT_MMAP_TILE, MMAP_SYNC);
	t = GDKms() - t;
	if (err) {
		stream_printf(err, "MT_mmap_save_tile: msync(%s,tile=%uM,%uM,SYNC) = %d (%dms)\n",
			      MT_mmap_tab[i].path, (unsigned int) (tile >> 20),
			      (unsigned int) (MT_MMAP_TILE >> 20), ret, t);
	}
	if (t > 200) {
		/* this took time, so we should report back on our actions and await new orders */
		if (MT_mmap_tab[i].save_tile == 1) {
			MT_mmap_tab[i].first_tile = tile;
			/* leave the first tile for a later sequential use pass (start unloading after it) */
			MT_mmap_tab[i].unload_tile = tile + MT_MMAP_TILE;
		}
		MT_mmap_tab[i].save_tile = tile + MT_MMAP_TILE;
		/* note: this releases the lock taken by our caller MT_mmap_trim */
		(void) pthread_mutex_unlock(&MT_mmap_lock);
		return 1;
	}
	return 0;
}

/* round-robin next; this ensures some fairness if multiple large results are produced simultaneously */
static int
MT_mmap_next(int i)
{
	if (i != -1) {
		i = MT_mmap_tab[i].next;
		if (i == -1)
			i = MT_mmap_first;
	}
	return i;
}

int
MT_mmap_trim(size_t target, void *fp)
{
	stream *err = (stream *) fp;
	size_t off, rss = MT_getrss();
	int i, worry = (rss * 4 > target * 3);	/* worry when RSS exceeds 3/4 of the target */

	(void) pthread_mutex_lock(&MT_mmap_lock);
	if (err) {
		stream_printf(err, "MT_mmap_trim(%u MB): rss = %u MB\n",
			      (unsigned int) (target >> 20), (unsigned int) (rss >> 20));
	}
	if (rss > target) {
		/* try to selectively unload pages from the writable regions */
		size_t delta = ((rss - target) + 4 * MT_MMAP_TILE - 1) & ~(MT_MMAP_TILE - 1);

		for (i = MT_mmap_next(MT_mmap_cur); delta && i != MT_mmap_cur; i = MT_mmap_next(i)) {
			if (MT_mmap_tab[i].fd >= 0 && MT_mmap_tab[i].writable) {
				if (MT_mmap_tab[i].unload_tile >= MT_mmap_tab[i].save_tile)
					MT_mmap_tab[i].unload_tile = MT_mmap_tab[i].first_tile;
				while (MT_mmap_tab[i].unload_tile < MT_mmap_tab[i].save_tile) {
					MT_mmap_unload_tile(i, MT_mmap_tab[i].unload_tile, err);
					MT_mmap_tab[i].unload_tile += MT_MMAP_TILE;
					if ((delta -= MT_MMAP_TILE) == 0)
						break;
				}
			}
		}
	}
	if (worry) {
		/* schedule background saves of tiles */
		for (i = MT_mmap_next(MT_mmap_cur); i != MT_mmap_cur; i = MT_mmap_next(i)) {
			if (MT_mmap_tab[i].fd >= 0 && MT_mmap_tab[i].writable &&
			    (MT_mmap_tab[i].pincnt == 0 || MT_mmap_tab[i].len > target)) {
				if (MT_mmap_tab[i].save_tile == 1) {
					/* first run: walk backwards until we hit an unsaved tile */
					for (off = MT_mmap_tab[i].len; off >= MT_MMAP_TILE; off -= MT_MMAP_TILE)
						if (MT_mmap_save_tile(i, off, err))
							return 1;	/* lock released by MT_mmap_save_tile */
				} else {
					/* save the next tile */
					for (off = MT_mmap_tab[i].save_tile; off + MT_MMAP_TILE < MT_mmap_tab[i].len; off += MT_MMAP_TILE) {
						if (MT_mmap_save_tile(i, off, err))
							return 1;	/* lock released by MT_mmap_save_tile */
					}
					/* we seem to have run through all savable tiles */
					if (!MT_mmap_tab[i].last_tile) {
						MT_mmap_tab[i].last_tile = 1;
						MT_mmap_tab[i].unload_tile = MT_mmap_tab[i].first_tile;
					}
				}
			}
		}
		MT_mmap_cur = i;
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
	return worry;
}
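/* Illustration (not part of the original file): a minimal sketch of the
 * VMtrim thread described in the big comment at the top. The loop shape,
 * the one-second poll interval, and the MT_sleep_ms() helper are
 * assumptions (the real thread is not in this excerpt); mem_maxsize() is
 * the RSS target the comment refers to. MT_mmap_trim() returns non-zero
 * while memory pressure is still in the "worry" zone.
 */
#if 0	/* example only, not compiled */
static void *
example_vmtrim_thread(void *arg)
{
	(void) arg;
	for (;;) {
		/* one pass: unload tiles if RSS > target, save at most one
		 * expensive (>200ms) tile if RSS > 3/4 of the target */
		(void) MT_mmap_trim(mem_maxsize(), NULL);
		MT_sleep_ms(1000);	/* hypothetical sleep helper */
	}
	return NULL;
}
#endif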
void
MT_mmap_pin(void *base, size_t len)
{
	int i;

	(void) pthread_mutex_lock(&MT_mmap_lock);
	i = MT_mmap_idx(base, len);
	if (i >= 0) {
		MT_mmap_tab[i].pincnt++;
	}
	(void) pthread_mutex_unlock(&MT_mmap_lock);
}
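/* Illustration (not part of the original file): the unload budget computed
 * in MT_mmap_trim() rounds the RSS excess to whole 128MB tiles and adds
 * roughly three tiles of headroom. A worked example with hypothetical
 * numbers: for rss - target = 300MB,
 *   delta = (300MB + 4*128MB - 1) & ~(128MB - 1)
 *         = (812MB - 1) rounded down to a tile multiple
 *         = 768MB, i.e. 6 tiles to unload.
 */
#if 0	/* example only, not compiled */
#include <assert.h>

static void
example_delta(void)
{
	size_t TILE = (size_t) 1 << 27;			/* 128MB tile */
	size_t excess = 300 * ((size_t) 1 << 20);	/* rss - target = 300MB */
	size_t delta = (excess + 4 * TILE - 1) & ~(TILE - 1);

	assert(delta == 6 * TILE);	/* 768MB */
}
#endif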