📄 dbz.java
字号:
// Dbz - access a Unix DBZ file
//
// Copyright (C) 1996 by Jef Poskanzer <jef@acme.com>. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// Visit the ACME Labs Java page for up-to-date versions of this and other
// fine Java utilities: http://www.acme.com/java/
package Acme;
import java.util.*;
import java.io.*;
/// Access a Unix DBZ file.
// <P>
// DBZ files are similar to DBM files. From the Java point of view,
// they are just on-disk hash tables. Comments from the C version:
// <P>
// The dbz database exploits the fact that when news stores a <key,value>
// tuple, the `value' part is a seek offset into a text file, pointing to
// a copy of the `key' part. This avoids the need to store a copy of
// the key in the dbz files. However, the text file *must* exist and be
// consistent with the dbz files, or things will fail.
// <P>
// The basic format of the database is a simple hash table containing the
// values. A value is stored by indexing into the table using a hash value
// computed from the key; collisions are resolved by linear probing (just
// search forward for an empty slot, wrapping around to the beginning of
// the table if necessary). Linear probing is a performance disaster when
// the table starts to get full, so a complication is introduced. The
// database is actually one *or more* tables, stored sequentially in the
// .pag file, and the length of linear-probe sequences is limited. The
// search (for an existing item or an empty slot) always starts in the
// first table, and whenever MAXRUN probes have been done in table N,
// probing continues in table N+1. This behaves reasonably well even in
// cases of massive overflow. There are some other small complications
// added, see comments below.
// <P>
// The table size is fixed for any particular database, but is determined
// dynamically when a database is rebuilt. The strategy is to try to pick
// the size so the first table will be no more than 2/3 full, that being
// slightly before the point where performance starts to degrade. (It is
// desirable to be a bit conservative because the overflow strategy tends
// to produce files with holes in them, which is a nuisance.)
// <P>
// <A HREF="/resources/classes/Acme/Dbz.java">Fetch the software.</A><BR>
// <A HREF="/resources/classes/Acme.tar.gz">Fetch the entire Acme package.</A>
public class Dbz extends Dictionary
{
/// For validating .dir format.
private static final int dbzVersion = 3;
/// Size of an offset - in Java that's an int.
private static final int SOF = 4;
// We assume that unused areas of a binary file are zeros, and that the
// bit pattern of `(of_t)0' is all zeros. The alternative is rather
// painful file initialization. Note that okayvalue(), if OVERFLOW is
// defined, knows what value of an offset would cause overflow.
private static final int VACANT = 0;
private static int BIAS( int o ) { return o + 1; }
private static int UNBIAS( int o ) { return o - 1; }
// In a Unix implementation, or indeed any in which an of_t is a byte
// count, there are a bunch of high bits free in an of_t. There is a
// use for them. Checking a possible hit by looking it up in the base
// file is relatively expensive, and the cost can be dramatically reduced
// by using some of those high bits to tag the value with a few more bits
// of the key's hash. This detects most false hits without the overhead of
// seek+read+strcmp. We use the top bit to indicate whether the value is
// tagged or not, and don't tag a value which is using the tag bits itself.
// We're in trouble if the of_t representation wants to use the top bit.
// The actual bitmasks and offset come from the configuration stuff,
// which permits fiddling with them as necessary, and also suppressing
// them completely (by defining the masks to 0). We build pre-shifted
// versions of the masks for efficiency.
private int tagbits;
private int taghere;
private int tagboth;
private boolean HASTAG( int o ) { return ( o & taghere ) != 0; }
private int TAG( int o ) { return o & tagbits; }
private int NOTAG( int o ) { return o & ( ~ tagboth ); }
private boolean CANTAG( int o ) { return ( o & tagboth ) == 0; }
private int MKTAG( int v ) { return ( v << tagshift ) & tagbits; }
// A new, from-scratch database, not built as a rebuild of an old one,
// needs to know table size, casemap algorithm, and tagging. Normally
// the user supplies this info, but there have to be defaults.
private static final int DEFSIZE=120011; // 300007 might be better
private static final char DEFCASE='C';
private static final int TAGENB=0x80;
private static final int TAGMASK=0x7f;
private static final int TAGSHIFT=24;
// We read configuration info from the .dir file into these variables,
// so we can avoid wired-in assumptions for an existing database.
//
// Among the info is a record of recent peak usages, so that a new table
// size can be chosen intelligently when rebuilding. 10 is a good
// number of usages to keep, since news displays marked fluctuations
// in volume on a 7-day cycle.
boolean olddbz; // .dir file empty but .pag not?
int tsize; // table size
int[] used = new int[11]; // entries used today, yesterday, ...
int valuesize; // size of table values, == SOF
int[] bytemap = new int[SOF]; // byte-order map
char casemap; // case-mapping algorithm
char fieldsep; // field separator in base file, if any
int tagenb; // unshifted tag-enable bit
int tagmask; // unshifted tag mask
int tagshift; // shift count for tagmask and tagenb
// For a program that makes many, many references to the database, it
// is a large performance win to keep the table in core, if it will fit.
// Note that this does hurt robustness in the event of crashes, and
// dbmclose() *must* be called to flush the in-core database to disk.
// The code is prepared to deal with the possibility that there isn't
// enough memory. There *is* an assumption that a size_t is big enough
// to hold the size (in bytes) of one table, so dbminit() tries to figure
// out whether this is possible first.
//
// The preferred way to ask for an in-core table is to do dbzincore(1)
// before dbminit(). The default is not to do it, although -DINCORE
// overrides this for backward compatibility with old dbz.
//
// We keep only the first table in core. This greatly simplifies the
// code, and bounds memory demand. Furthermore, doing this is a large
// performance win even in the event of massive overflow.
private boolean incore = true;
// Buffer for .pag reads. Buffering more than about 16 does not help
// significantly at the densities we try to maintain
private int[] pagbuf = new int[16];
// Buffer for base-file reads. Message-IDs (all news ever needs to
// read) are essentially never longer than 64 bytes.
private byte[] basebuf = new byte[64];
private static final int NOTFOUND = -1;
// Arguably the searcher struct for a given routine ought to be local to
// it, but a fetch() is very often immediately followed by a store(), and
// in some circumstances it is a useful performance win to remember where
// the fetch() completed. So we use a global struct and remember whether
// it is current.
DbzSearcher srch = new DbzSearcher();
DbzSearcher prev; // srch or null
// Byte-ordering stuff.
private int[] mybmap = new int[SOF]; // my byte order
private boolean bytesame;
private int MAPIN( int o ) { return bytesame ? o : bytemap( o, bytemap, mybmap ); }
private int MAPOUT( int o ) { return bytesame ? o : bytemap( o, mybmap, bytemap ); }
// The files.
private RandomAccessFile basef; // descriptor for base file
private RandomAccessFile dirf; // descriptor for .dir file
private RandomAccessFile pagf; // descriptor for .pag file
private boolean readOnly; // files open read-only
private int pagpos; // posn in pagf; only search may set != -1
private int[] corepag; // incore version of .pag file, if any
private boolean written; // has a store() been done?
/// Constructor.
// @exception IOException if something goes wrong
public Dbz( File file ) throws IOException
{
try
{
basef = new RandomAccessFile( file, "rw" );
dirf = new RandomAccessFile( file.getPath() + ".dir", "rw" );
pagf = new RandomAccessFile( file.getPath() + ".pag", "rw" );
readOnly = false;
}
catch ( Exception e )
{
basef = new RandomAccessFile( file, "r" );
dirf = new RandomAccessFile( file.getPath() + ".dir", "r" );
pagf = new RandomAccessFile( file.getPath() + ".pag", "r" );
readOnly = true;
}
pagpos = -1;
getconf();
tagbits = tagmask << tagshift;
taghere = tagenb << tagshift;
tagboth = tagbits | taghere;
mybytemap( mybmap );
bytesame = true;
for ( int i = 0; i < SOF; ++i )
if ( mybmap[i] != bytemap[i] )
bytesame = false;
// Get first table into core, if it looks desirable and feasible.
int s = tsize * SOF;
if ( incore && s / SOF == tsize )
corepag = getcore();
else
corepag = null;
// Misc. setup.
crcinit();
written = false;
prev = null;
}
/// Returns the number of elements contained within the dbz file.
public int size()
{
// !!!
return 0;
}
/// Returns true if the dbz file contains no elements.
public boolean isEmpty()
{
// !!!
return true;
}
/// Returns an enumeration of the dbz file's keys.
public Enumeration keys()
{
// !!!
return new DbzKeyEnumerator();
}
/// Returns an enumeration of the elements. Use the Enumeration methods
// on the returned object to fetch the elements sequentially.
public Enumeration elements()
{
return new DbzElementEnumerator( this );
}
/// Gets the object associated with the specified key in the dbz file.
// @param key the key in the hash table
// @returns the element for the key, or null if the key
// is not defined in the hash table.
public Object get( Object key )
{
return fetch( mapcase( (String) key ) );
}
/// Puts the specified element into the Dictionary, using the specified
// key. The element may be retrieved by doing a get() with the same
// key. The key and the element cannot be null.
// @param key the specified hashtable key
// @param value the specified element
// @return the old value of the key, or null if it did not have one.
// @exception NullPointerException If the value of the specified
// element is null.
public Object put( Object key, Object value )
{
return store( mapcase( (String) key ), value );
}
/// Removes the element corresponding to the key. Does nothing if the
// key is not present.
// @param key the key that needs to be removed
// @return the value of key, or null if the key was not found.
public Object remove( Object key )
{
// !!!
return null;
}
/// Main program - for testing etc.
public static void main( String[] args )
{
// !!!
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -