WikipediaTokenizerTest.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.wikipedia.analysis;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Tests for {@link WikipediaTokenizer}: token text, token types, and position increments.
 */
public class WikipediaTokenizerTest extends TestCase {

  public WikipediaTokenizerTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void testHandwritten() throws Exception {
    //make sure all tokens are in only one type
    String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] "
        + "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] "
        + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
        + " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. "
        + "==heading== ===sub head=== followed by some text [[Category:blah| ]] "
        + "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed."
        + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
        + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
        + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";

    Map tcm = new HashMap();//map tokens to types
    tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
    tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
    tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
    tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
    tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
    tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
    tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
    tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
    //alphanums
    tcm.put("This", "<ALPHANUM>");
    tcm.put("is", "<ALPHANUM>");
    tcm.put("a", "<ALPHANUM>");
    tcm.put("Category", "<ALPHANUM>");
    tcm.put("linked", "<ALPHANUM>");
    tcm.put("parens", "<ALPHANUM>");
    tcm.put("external", "<ALPHANUM>");
    tcm.put("URL", "<ALPHANUM>");
    tcm.put("and", "<ALPHANUM>");
    tcm.put("period", "<ALPHANUM>");
    tcm.put("Here", "<ALPHANUM>");
    tcm.put("Here's", "<APOSTROPHE>");
    tcm.put("here", "<ALPHANUM>");
    tcm.put("Johnny", "<ALPHANUM>");
    tcm.put("followed", "<ALPHANUM>");
    tcm.put("by", "<ALPHANUM>");
    tcm.put("text", "<ALPHANUM>");
    tcm.put("that", "<ALPHANUM>");
    tcm.put("but", "<ALPHANUM>");
    tcm.put("never", "<ALPHANUM>");
    tcm.put("closed", "<ALPHANUM>");
    tcm.put("goes", "<ALPHANUM>");
    tcm.put("for", "<ALPHANUM>");
    tcm.put("this", "<ALPHANUM>");
    tcm.put("an", "<ALPHANUM>");
    tcm.put("some", "<ALPHANUM>");
    tcm.put("martian", "<ALPHANUM>");
    tcm.put("code", "<ALPHANUM>");
    tcm.put("foo", WikipediaTokenizer.CATEGORY);
    tcm.put("bar", WikipediaTokenizer.CATEGORY);
    tcm.put("none", WikipediaTokenizer.CATEGORY);
    tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
    tcm.put("blah", WikipediaTokenizer.CATEGORY);
    tcm.put("ital", WikipediaTokenizer.CATEGORY);
    tcm.put("cat", WikipediaTokenizer.CATEGORY);
    tcm.put("italics", WikipediaTokenizer.ITALICS);
    tcm.put("more", WikipediaTokenizer.ITALICS);
    tcm.put("bold", WikipediaTokenizer.BOLD);
    tcm.put("same", WikipediaTokenizer.BOLD);
    tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
    tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
    tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
    tcm.put("heading", WikipediaTokenizer.HEADING);
    tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
    tcm.put("head", WikipediaTokenizer.SUB_HEADING);
    tcm.put("Citation", WikipediaTokenizer.CITATION);
    tcm.put("3.25", "<NUM>");
    tcm.put("3.50", "<NUM>");

    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
    Token token = new Token();
    int count = 0;
    int numItalics = 0;
    int numBoldItalics = 0;
    int numCategory = 0;
    int numCitation = 0;
    while ((token = tf.next(token)) != null) {
      String tokText = token.termText();
      //System.out.println("Text: " + tokText + " Type: " + token.type());
      String expectedType = (String) tcm.get(tokText);
      assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
      assertTrue(token.type() + " is not equal to " + expectedType + " for " + token,
          token.type().equals(expectedType));
      count++;
      if (token.type().equals(WikipediaTokenizer.ITALICS)) {
        numItalics++;
      } else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS)) {
        numBoldItalics++;
      } else if (token.type().equals(WikipediaTokenizer.CATEGORY)) {
        numCategory++;
      } else if (token.type().equals(WikipediaTokenizer.CITATION)) {
        numCitation++;
      }
    }
    assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(),
        count >= tcm.size());
    assertTrue(numItalics + " does not equal: " + 4 + " for numItalics", numItalics == 4);
    assertTrue(numBoldItalics + " does not equal: " + 3 + " for numBoldItalics", numBoldItalics == 3);
    assertTrue(numCategory + " does not equal: " + 10 + " for numCategory", numCategory == 10);
    assertTrue(numCitation + " does not equal: " + 1 + " for numCitation", numCitation == 1);
  }

  /** Pulls the next token, then checks its text and its position increment. */
  private Token checkToken(WikipediaTokenizer tf, Token token, String expectedText, int expectedPosIncr)
      throws Exception {
    token = tf.next(token);
    assertTrue("token is null and it shouldn't be", token != null);
    String text = new String(token.termBuffer(), 0, token.termLength());
    assertTrue(text + " is not equal to " + expectedText, text.equals(expectedText));
    assertTrue(token.getPositionIncrement() + " does not equal: " + expectedPosIncr,
        token.getPositionIncrement() == expectedPosIncr);
    return token;
  }

  public void testLinkPhrases() throws Exception {
    String test = "click [[link here again]] click [http://lucene.apache.org here again]";
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
    Token token = new Token();
    token = checkToken(tf, token, "click", 1);
    token = checkToken(tf, token, "link", 1);
    token = checkToken(tf, token, "here", 1);
    token = checkToken(tf, token, "again", 1);
    token = checkToken(tf, token, "click", 1);
    token = checkToken(tf, token, "http://lucene.apache.org", 1);
    //the URL and "here" should be at the same position for phrases to work
    token = checkToken(tf, token, "here", 0);
    checkToken(tf, token, "again", 1);
  }

  /** Pulls the next token, then checks its text and that its type is EXTERNAL_LINK_URL. */
  private Token checkUrl(WikipediaTokenizer tf, Token token, String expectedUrl) throws Exception {
    token = tf.next(token);
    assertTrue("token is null and it shouldn't be", token != null);
    String text = new String(token.termBuffer(), 0, token.termLength());
    assertTrue(text + " is not equal to " + expectedUrl, text.equals(expectedUrl));
    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL,
        token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL));
    return token;
  }

  public void testLinks() throws Exception {
    String test = "[http://lucene.apache.org/java/docs/index.html#news here]"
        + " [http://lucene.apache.org/java/docs/index.html?b=c here]"
        + " [https://lucene.apache.org/java/docs/index.html?b=c here]";
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
    Token token = new Token();
    token = checkUrl(tf, token, "http://lucene.apache.org/java/docs/index.html#news");
    tf.next(token);//skip "here"
    token = checkUrl(tf, token, "http://lucene.apache.org/java/docs/index.html?b=c");
    tf.next(token);//skip "here"
    checkUrl(tf, token, "https://lucene.apache.org/java/docs/index.html?b=c");
  }
}
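
// ---------------------------------------------------------------------------
// A minimal sketch (a separate file, under the hypothetical name
// WikipediaTokenDump.java, not part of the test suite), assuming only the
// Lucene 2.x contrib API the test above already exercises: it dumps each
// token's text, type, and position increment so the type tagging verified
// in testHandwritten() can be inspected by hand.
// ---------------------------------------------------------------------------

import org.apache.lucene.analysis.Token;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

import java.io.StringReader;

public class WikipediaTokenDump {
  public static void main(String[] args) throws Exception {
    String wikitext = "[[Category:foo]] ''italics'' and a [[link|display text]]";
    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(wikitext));
    Token token = new Token();
    while ((token = tf.next(token)) != null) {
      String text = new String(token.termBuffer(), 0, token.termLength());
      //one line per token: text, type, and position increment
      System.out.println(text + "\t" + token.type() + "\tposIncr=" + token.getPositionIncrement());
    }
  }
}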
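
// ---------------------------------------------------------------------------
// Because every token carries a type string (CATEGORY, ITALICS,
// EXTERNAL_LINK_URL, ...), a downstream TokenFilter can keep or drop tokens
// by type. Lucene 2.x contrib ships no stock filter for this, so the class
// below is a hypothetical sketch (a separate file, TypeKeepingFilter.java)
// built on the standard TokenFilter base class; only the type constants it
// is meant to be used with come from WikipediaTokenizer itself.
// ---------------------------------------------------------------------------

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.util.Set;

public class TypeKeepingFilter extends TokenFilter {
  private final Set acceptedTypes; //token type strings to let through

  public TypeKeepingFilter(TokenStream in, Set acceptedTypes) {
    super(in);
    this.acceptedTypes = acceptedTypes;
  }

  public Token next(Token result) throws IOException {
    //pull tokens from the wrapped stream until one has an accepted type
    for (Token t = input.next(result); t != null; t = input.next(result)) {
      if (acceptedTypes.contains(t.type())) {
        return t;
      }
    }
    return null;
  }
}

// Usage sketch: new TypeKeepingFilter(new WikipediaTokenizer(reader),
//   java.util.Collections.singleton(WikipediaTokenizer.CATEGORY))
// would yield only the category tokens that testHandwritten() counts.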