synset.java

来自「WordNet is an extensive database develop」· Java 代码 · 共 219 行

JAVA

219 行

/*
 * WordNet-Java
 *
 * Copyright 1998 by Oliver Steele.  You can use this software freely so long as you preserve
 * the copyright notice and this restriction, and label your changes.
 */
package edu.gwu.wordnet;

import java.io.*;
import java.util.*;

/** A <code>Synset</code>, or <b>syn</b>onym <b>set</b>, represents a line of a WordNet <var>pos</var><code>.data</code> file.
 * A <code>Synset</code> represents a concept, and contains a set of <code>Word</code>s, each of which has a sense
 * that names that concept (and each of which is therefore synonymous with the other words in the
 * <code>Synset</code>).
 *
 * <code>Synset</code>s are linked by {@link Pointer}s into a network of related concepts; this is the <i>Net</i>
 * in WordNet.  {@link Synset#getTargets getTargets} retrieves the targets of these links, and
 * {@link Synset#getPointers getPointers} retrieves the pointers themselves.
 *
 * @see Word
 * @see Pointer
 * @author Oliver Steele, steele@cs.brandeis.edu
 * @version 1.0
 */
public class Synset implements PointerTarget {
	/*
	 * Instance implementation
	 */
	/** Dictionary supplied at construction; handed to <code>Pointer.parsePointer</code> during parsing. */
	protected FileBackedDictionary dictionary;
	/** Part of speech looked up from the line's <code>ss_type</code> field. */
	protected POS pos;
	/** First field of the data line; together with <code>pos</code> it defines equality. */
	protected long offset;
	/** True when the line's <code>ss_type</code> was <code>"s"</code> (which is then looked up as <code>"a"</code>). */
	protected boolean isAdjectiveCluster;
	/** Words of this synset, in the order they appear on the data line. */
	protected Word[] words;
	/** Pointers of this synset, in the order they appear on the data line. */
	protected Pointer[] pointers;
	/** Trimmed text following the first <code>'|'</code> on the data line, or <code>null</code> if there is none. */
	protected String gloss;

	//
	// Object initialization
	//
	/**
	 * Creates an empty synset bound to <var>dictionary</var>.  The remaining fields stay unset
	 * until {@link #initializeFrom} is called.
	 */
	Synset(FileBackedDictionary dictionary) {
		this.dictionary = dictionary;
	}

	/**
	 * Fills in this synset from one line of a <var>pos</var><code>.data</code> file.
	 *
	 * Fields are consumed in this order: offset, lex_filenum (skipped), ss_type, w_cnt (hexadecimal),
	 * w_cnt pairs of lemma and lex_id, p_cnt (decimal), p_cnt pointers and, for verbs only, the
	 * frame list.  The gloss is taken directly from <var>line</var>, not from the tokenizer.
	 *
	 * @param line the raw data line
	 * @return this synset, to allow call chaining
	 * @throws RuntimeException if a word carries an unrecognized syntactic marker; other runtime
	 *         exceptions raised by the tokenizer or by array indexing propagate unchanged
	 */
	Synset initializeFrom(String line) {
		TokenizerParser tokenizer = new TokenizerParser(line, " ");

		this.offset = tokenizer.nextLong();
		tokenizer.nextToken();	// lex_filenum
		String ss_type = tokenizer.nextToken();
		// "s" is recorded as an adjective cluster and then treated as a plain adjective.
		this.isAdjectiveCluster = ss_type.equals("s");
		if (this.isAdjectiveCluster) {
			ss_type = "a";
		}
		this.pos = POS.lookup(ss_type);

		int wordCount = tokenizer.nextHexInt();
		this.words = new Word[wordCount];
		for (int i = 0; i < wordCount; i++) {
			words[i] = parseWord(tokenizer, i);
		}

		int pointerCount = tokenizer.nextInt();
		this.pointers = new Pointer[pointerCount];
		for (int i = 0; i < pointerCount; i++) {
			pointers[i] = Pointer.parsePointer(dictionary, this, i, tokenizer);
		}

		if (pos == POS.VERB) {
			parseVerbFrames(tokenizer);
		}

		this.gloss = extractGloss(line);
		return this;
	}

	/**
	 * Reads one lemma / lex_id pair from <var>tokenizer</var> and builds the word at position <var>index</var>.
	 * A trailing syntactic marker such as <code>(p)</code>, <code>(a)</code> or <code>(ip)</code> is removed
	 * from the lemma and turned into the matching <code>Word</code> flag.
	 */
	private Word parseWord(TokenizerParser tokenizer, int index) {
		String lemma = tokenizer.nextToken();
		tokenizer.nextHexInt();	// lex_id: must be consumed, but is not stored
		int flags = Word.NONE;
		// strip the syntactic marker
		int lparen = lemma.indexOf('(');
		if (lparen > 0 && lemma.endsWith(")")) {
			String marker = lemma.substring(lparen + 1, lemma.length() - 1);
			// Changed: the cut used to be at lparen - 1, which also removed the last
			// character of the lemma itself ("galore(ip)" became "galor").
			lemma = lemma.substring(0, lparen);
			if (marker.equals("p")) {
				flags |= Word.PREDICATIVE;
			} else if (marker.equals("a")) {
				flags |= Word.ATTRIBUTIVE;
			} else if (marker.equals("ip")) {
				flags |= Word.IMMEDIATE_POSTNOMINAL;
			} else {
				throw new RuntimeException("unknown syntactic marker " + marker);
			}
		}
		return new Word(this, index, lemma.replace('_', ' '), flags);
	}

	/**
	 * Reads the verb frame list (<code>f_cnt</code> followed by <code>+ f_num w_num</code> triples) and sets
	 * the frame flag on the addressed word, or on every word when <code>w_num</code> is 0.
	 */
	private void parseVerbFrames(TokenizerParser tokenizer) {
		int f_cnt = tokenizer.nextInt();
		for (int i = 0; i < f_cnt; i++) {
			tokenizer.nextToken();	// "+"
			int f_num = tokenizer.nextInt();
			// Changed: w_num is a hexadecimal field in the WordNet data format (like w_cnt),
			// so it is read with nextHexInt() instead of nextInt().
			int w_num = tokenizer.nextHexInt();
			if (w_num > 0) {
				words[w_num - 1].setVerbFrameFlag(f_num);
			} else {
				for (int j = 0; j < words.length; ++j) {
					words[j].setVerbFrameFlag(f_num);
				}
			}
		}
	}

	/**
	 * Returns the trimmed text after the first <code>'|'</code> in <var>line</var>, or <code>null</code>
	 * when the line has no <code>'|'</code> past its first character.
	 */
	private static String extractGloss(String line) {
		int index = line.indexOf('|');
		if (index > 0) {
			// Changed: start right after the bar and let trim() drop the separating blank.
			// Starting at index + 2 threw when the bar ended the line and lost a
			// character when no blank followed the bar.
			return line.substring(index + 1).trim();
		}
		return null;
	}

	/**
	 * Builds a synset from a data line.  If parsing fails with a runtime exception, the offending
	 * line is written to <code>System.err</code> and the same exception is rethrown.
	 *
	 * @param dictionary the dictionary the new synset belongs to
	 * @param line the raw data line
	 * @return the parsed synset
	 */
	static Synset parseSynset(FileBackedDictionary dictionary, String line) {
		try {
			return new Synset(dictionary).initializeFrom(line);
		} catch (RuntimeException e) {
			System.err.println("while parsing " + line);
			throw e;
		}
	}
	
	//
	// Object methods
	//
	/** Two synsets are equal when they have equal parts of speech and the same offset. */
	public boolean equals(Object object) {
		if (!(object instanceof Synset)) {
			return false;
		}
		Synset that = (Synset) object;
		return that.pos.equals(pos) && that.offset == offset;
	}
	
	/** Combines the part-of-speech hash with the low 32 bits of the offset, consistent with {@link #equals}. */
	public int hashCode() {
		return pos.hashCode() ^ (int) offset;
	}
	
	/** Returns a debugging string containing the offset, the part of speech and the short description. */
	public String toString() {
		return "[Synset " + offset + "@" + pos +": \""+ getDescription() +"\"]";
	}
	
	
	//
	// Accessors
	//
	/** Returns the part of speech of this synset. */
	public POS getPOS() {
		return pos;
	}
	
	/** Returns the gloss, or <code>null</code> if the data line had none. */
	public String getGloss() {
		return gloss;
	}

	/** Returns the internal word array itself (not a copy); callers should not modify it. */
	public Word[] getWords() {
		return words;
	}

	/** Returns the word at the zero-based position <var>index</var>. */
	public Word getWord(int index) {
		return words[index];
	}


	//
	// Description
	//
	/** Returns the lemmas of all words in this synset, separated by <code>", "</code>. */
	public String getDescription() {
		StringBuffer buffer = new StringBuffer();
		for (int i = 0; i < words.length; ++i) {
			if (i > 0) {
				buffer.append(", ");
			}
			buffer.append(words[i].lemma);
		}
		return buffer.toString();
	}
	
	/** Returns {@link #getDescription}, followed by <code>" -- (gloss)"</code> when a gloss is present. */
	public String getLongDescription() {
		String description = getDescription();
		String gloss = getGloss();
		if (gloss != null) {
			description += " -- (" + gloss + ")";
		}
		return description;
	}


	//
	// Pointers
	//
	/** Maps each pointer to its target, preserving order. */
	protected static PointerTarget[] collectTargets(Pointer[] pointers) {
		PointerTarget[] targets = new PointerTarget[pointers.length];
		for (int i = 0; i < pointers.length; ++i) {
			targets[i] = pointers[i].getTarget();
		}
		return targets;
	}
	
	/** Returns the internal pointer array itself (not a copy); callers should not modify it. */
	public Pointer[] getPointers() {
		return pointers;
	}
	
	/**
	 * Returns a new array holding those pointers whose type equals <var>type</var>, in their
	 * original order.  The array is empty when no pointer matches.
	 */
	public Pointer[] getPointers(PointerType type) {
		Vector vector = new Vector(pointers.length);
		for (int i = 0; i < pointers.length; ++i) {
			Pointer pointer = pointers[i];
			if (pointer.getType().equals(type)) {
				vector.addElement(pointer);
			}
		}
		Pointer[] targets = new Pointer[vector.size()];
		vector.copyInto(targets);
		return targets;
	}
	
	/** Returns the targets of all pointers of this synset. */
	public PointerTarget[] getTargets() {
		return collectTargets(getPointers());
	}
	
	/** Returns the targets of the pointers whose type equals <var>type</var>. */
	public PointerTarget[] getTargets(PointerType type) {
		return collectTargets(getPointers(type));
	}
}

synset.java - 源码说明

本页面展示了「WordNet is an extensive database developed by Princeton University faculty and students over the las」中的 synset.java 源码文件，采用 Java 编程语言编写，共 219 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与University相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?