📄 synonymtokenfilter.java
字号:
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;/** * Injects additional tokens for synonyms of token terms fetched from the * underlying child stream; the child stream must deliver lowercase tokens * for synonyms to be found. * * @author whoschek.AT.lbl.DOT.gov */public class SynonymTokenFilter extends TokenFilter { /** The Token.type used to indicate a synonym to higher level filters. */ public static final String SYNONYM_TOKEN_TYPE = "SYNONYM"; private final SynonymMap synonyms; private final int maxSynonyms; private String[] stack = null; private int index = 0; private Token current = null; private int todo = 0; /** * Creates an instance for the given underlying stream and synonym table. * * @param input * the underlying child token stream * @param synonyms * the map used to extract synonyms for terms * @param maxSynonyms * the maximum number of synonym tokens to return per underlying * token word (a value of Integer.MAX_VALUE indicates unlimited) */ public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) { super(input); if (input == null) throw new IllegalArgumentException("input must not be null"); if (synonyms == null) throw new IllegalArgumentException("synonyms must not be null"); if (maxSynonyms < 0) throw new IllegalArgumentException("maxSynonyms must not be negative"); this.synonyms = synonyms; this.maxSynonyms = maxSynonyms; } /** Returns the next token in the stream, or null at EOS. */ public Token next() throws IOException { Token token; while (todo > 0 && index < stack.length) { // pop from stack token = createToken(stack[index++], current); if (token != null) { todo--; return token; } } token = input.next(); if (token == null) return null; // EOS; iterator exhausted stack = synonyms.getSynonyms(token.termText()); // push onto stack if (stack.length > maxSynonyms) randomize(stack); index = 0; current = token; todo = maxSynonyms; return token; } /** * Creates and returns a token for the given synonym of the current input * token; Override for custom (stateless or stateful) behaviour, if desired. * * @param synonym * a synonym for the current token's term * @param current * the current token from the underlying child stream * @return a new token, or null to indicate that the given synonym should be * ignored */ protected Token createToken(String synonym, Token current) { Token token = new Token( synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE); token.setPositionIncrement(0); return token; } /** * Randomize synonyms to later sample a subset. Uses constant random seed * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random * number generator with medium statistical quality (multiplicative * congruential method), producing integers in the range [Integer.MIN_VALUE, * Integer.MAX_VALUE]. */ private static void randomize(Object[] arr) { int seed = 1234567; // constant int randomState = 4*seed + 1;// Random random = new Random(seed); // unnecessary overhead int len = arr.length; for (int i=0; i < len-1; i++) { randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32) int r = randomState % (len-i); if (r < 0) r = -r; // e.g. -9 % 2 == -1// int r = random.nextInt(len-i); // swap arr[i, i+r] Object tmp = arr[i]; arr[i] = arr[i + r]; arr[i + r] = tmp; } } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -