⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pv.c

📁 机器学习作者tom mitchell的书上代码
💻 C
📖 第 1 页 / 共 2 页
字号:
/* "Position vector", a (compressed) list of word positions in documents *//* Copyright (C) 1998 Andrew McCallum   Written by:  Andrew Kachites McCallum <mccallum@cs.cmu.edu>   This file is part of the Bag-Of-Words Library, `libbow'.   This library is free software; you can redistribute it and/or   modify it under the terms of the GNU Library General Public License   as published by the Free Software Foundation, version 2.      This library is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   Library General Public License for more details.   You should have received a copy of the GNU Library General Public   License along with this library; if not, write to the Free Software   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#define _FILE_OFFSET_BITS 64#include <bow/libbow.h>#include <bow/archer.h>#define PV_DEBUG 1/* The total amount of memory consumed by PVM's */int bow_pvm_total_bytes = 0;/* The maximum memory we will allow PVM's to take before we flush them   to disk.  Currently set to 128M */int bow_pvm_max_total_bytes = 128 * 1024 * 1024;/* Allocate and return a new PVM that can hold SIZE bytes */bow_pvm *bow_pvm_new (int size){  bow_pvm *ret = bow_malloc (sizeof (bow_pvm) + size);  ret->size = size;  ret->read_end = 0;  ret->write_end = 0;  bow_pvm_total_bytes += sizeof (bow_pvm) + size;  return ret;}/* Increase the capacity of PVM, growing by doubling size until we get   to 128k, then just grow by 128k increments. 
*/voidbow_pvm_grow (bow_pvm **pvm){  if ((*pvm)->size < 64 * 1024)    {      (*pvm)->size *= 2;      bow_pvm_total_bytes += (*pvm)->size;    }  else    {      (*pvm)->size += 64 * 1024;      bow_pvm_total_bytes += 64 * 1024;    }  *pvm = bow_realloc (*pvm, sizeof (bow_pvm) + (*pvm)->size);}/* Free the memory associated with the PVM */voidbow_pvm_free (bow_pvm *pvm){  bow_free (pvm);}/* Put the PVM's reader-pointer back to the beginning */static inline voidbow_pvm_rewind (bow_pvm *pvm){  pvm->read_end = 0;}/* PV functions *//* The first four bytes of a segment are an int that indicate how many   bytes are allocated in this segment.  The last four bytes of a   segment are an int that indicates the seek location of the next   segment.  The read_segment_bytes_remaining does not include   the size of the two int's. *//* Always enough for one "document index"/"word index" pair:   5 bytes == 6+4*7 == 34 bits for di, likewise for pi. */#define bow_pv_max_sizeof_di_pi (2 * 5)static int bow_pv_sizeof_first_segment = 2 * bow_pv_max_sizeof_di_pi;/* Fill in PV with the correct initial values. */voidbow_pv_init (bow_pv *pv, FILE *fp){  //pv->byte_count = 0;  pv->word_count = 0;  //pv->document_count = 0;  pv->pvm = NULL;  pv->seek_start = 0; //-1  pv->read_seek_end = 0;  pv->read_segment_bytes_remaining = -1;  pv->read_last_di = -1;  pv->read_last_pi = -1;  pv->write_last_di = -1;  pv->write_last_pi = -1;  pv->write_seek_last_tailer = 0;	/* This value must match READ_SEEK_END */}/* Write this PV's PVM to disk, and free the PVM. */voidbow_pv_flush (bow_pv *pv, FILE *fp){  off_t seek_new_segment;  off_t seek_new_tailer;  if (pv->pvm == NULL || pv->pvm->write_end == 0)    return;  /* Seek to the end of the file, which is the position at which this     segment of the PV will begin. */  fseeko (fp, 0, SEEK_END);  seek_new_segment = ftello (fp);  /* If none of this PV has ever been written to disk, remember this     position as the start position so that we can rewind there later. 
*/  if (pv->seek_start == 0) //-1    {      pv->seek_start = seek_new_segment;      pv->read_seek_end = seek_new_segment;      pv->read_segment_bytes_remaining = pv->pvm->write_end;    }  /* Write the "header", which is the number of contents data bytes in     this segment. */  bow_fwrite_int (pv->pvm->write_end, fp);  /* Write the contents data */  fwrite (pv->pvm->contents, sizeof (unsigned char), pv->pvm->write_end, fp);  /* Write (a temporary value for) the "tailer".  Later we will put     here the seek position of the next pv segment on disk. */  /* xxx Don't actually need a ftello() here.  Do the math instead. */  seek_new_tailer = ftello (fp);  bow_fwrite_off_t (0, fp); //-1  /* If this is not the first time this PV has been flushed, then     the "tailer" of the previous flushed segment, and write the seek     position of this segment there. */  if (pv->write_seek_last_tailer != 0)    {      fseeko (fp, pv->write_seek_last_tailer, SEEK_SET);      bow_fwrite_off_t (seek_new_segment, fp);    }  pv->write_seek_last_tailer = seek_new_tailer;  bow_pvm_total_bytes -= sizeof (bow_pvm) + pv->pvm->size;  bow_pvm_free (pv->pvm);  pv->pvm = NULL;}/* Write to PVM the unsigned integer I, marked with the special flag   saying if it is a DI or a PI, (as indicated by IS_DI).  Assumes   there is enough space there in this PVM to write the info.  Returns   the number of bytes written. 
*/
/* Encoding: the first byte carries the IS_DI flag, an "is_more" flag
   and the low 6 bits of I; every following byte carries an "is_more"
   flag and the next 7 bits of I, least-significant group first. */
static inline int
bow_pvm_write_unsigned_int (bow_pvm *pvm, unsigned int i, int is_di)
{
  bow_pe pe;
  int byte_count = 1;		/* Count already the last byte */

  /* assert (i < (1 << 6+7+7+7+1)); */
  pe.bits.is_di = is_di ? 1 : 0;

  if (i > 0x3f)			/* binary = 00111111 */
    {
      /* Does not fit in the first byte's 6 bits: emit the low 6 bits
	 now and continue with 7-bit groups. */
      pe.bits.is_more = 1;
      pe.bits.index = i & 0x3f;	/* binary = 00111111 */
      pvm->contents[pvm->write_end++] = pe.byte;  /* Write the first byte */
      byte_count++;
      i = i >> 6;
      while (i > 0x7f)		/* binary = 01111111 */
	{
	  pe.bits_more.is_more = 1;
	  pe.bits_more.index = i & 0x7f;
	  pvm->contents[pvm->write_end++] = pe.byte;
	  byte_count++;
	  i = i >> 7;
	}
      /* Final byte: whatever is left, with the "more" flag cleared. */
      pe.bits_more.is_more = 0;
      pe.bits_more.index = i;
      pvm->contents[pvm->write_end++] = pe.byte;
    }
  else
    {
      pe.bits.is_more = 0;
      pe.bits.index = i;
      /* Write the first byte and only */
      pvm->contents[pvm->write_end++] = pe.byte;
    }
  return byte_count;
}

/* Read an unsigned integer into I, and indicate whether it is a
   "document index" or a "position index" by the value of IS_DI.
   Reading starts at the PVM's reader-pointer, which is advanced past
   the bytes consumed.  Returns the number of bytes read. */
static inline int
bow_pvm_read_unsigned_int (bow_pvm *pvm, unsigned int *i, int *is_di)
{
  bow_pe pe;
  /* Accumulate in an unsigned type: the last 7-bit group of a large
     value is shifted into the top bits, which would overflow a signed
     int. */
  unsigned int index;
  int shift = 6;
  int byte_count = 1;

  pe.byte = pvm->contents[pvm->read_end++];
  *is_di = pe.bits.is_di ? 1 : 0;
  index = pe.bits.index;
  while (pe.bits.is_more)
    /* The above test relies on pe.bits.is_more == pe.bits_more.is_more */
    {
      pe.byte = pvm->contents[pvm->read_end++];
      byte_count++;
      index |= (unsigned int) pe.bits_more.index << shift;
      shift += 7;
    }
  *i = index;
  return byte_count;
}

/* Read an unsigned integer into I, and indicate whether it is a
   "document index" or a "position index" by the value of IS_DI.
   Assumes that FP is already seek'ed to the correct position. Returns
   the number of bytes read.
*/static inline intbow_pv_read_unsigned_int (unsigned int *i, int *is_di, FILE *fp){  bow_pe pe;  int index;  int shift = 6;  int byte_count = 1;  pe.byte = fgetc (fp);  if (pe.bits.is_di)    *is_di = 1;  else    *is_di = 0;  index = pe.bits.index;  while (pe.bits.is_more)    /* The above test relies on pe.bits.is_more == pe.bits_more.is_more */    {      pe.byte = fgetc (fp);      byte_count++;      index |= pe.bits_more.index << shift;      shift += 7;    }  *i = index;  return byte_count;}#define PV_WRITE_SIZE_INT(N)			\(((N) < (1 << (6+1)))				\ ? 1						\ : (((N) < (1 << (6+7+1)))			\    ? 2						\    : (((N) < (1 << (6+7+7+1)))			\       ? 3					\       : (((N) < (1 << (6+7+7+7+1)))		\	  ? 4					\	  : 5))))static inline intbow_pv_write_size_di_pi (bow_pv *pv, int di, int pi){

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -