⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stemmeren.java

📁 java编写的OCR软件
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
      if (ends("iveness")) { r("ive"); break; }
      if (ends("fulness")) { r("ful"); break; }
      if (ends("ousness")) { r("ous"); break; }
      break;
    case 't':
      if (ends("aliti")) { r(NORM_AL); break; }
      if (ends("iviti")) { r("ive"); break; }
      if (ends("biliti")) { r(NORM_BLE); break; }
      break;
    case 'g':
      if (ends("logi")) { r("log"); break; }
    }
  }

  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3.
   *
   * Dispatches on the last character of the word, b[k].  Within a case the
   * suffixes are tried in order; for the first one that ends() accepts, r()
   * is called with the replacement text and the switch is left.  When no
   * suffix matches, nothing is changed here.
   * NOTE(review): ends() and r() are defined outside this section; r() is
   * presumably a conditional replace of the matched suffix -- confirm.
   */
  private final void step4() {
    switch (b[k]) {
    case 'e':
      if (ends("icate")) { r(NORM_IC); break; }
      if (ends("ative")) { r(""); break; }
      if (ends("alize")) { r(NORM_AL); break; }
      break;
    case 'i':
      if (ends("iciti")) { r(NORM_IC); break; }
      break;
    case 'l':
      if (ends("ical")) { r(NORM_IC); break; }
      if (ends("ful")) { r(""); break; }
      break;
    case 's':
      if (ends("ness")) { r(""); break; }
      break;
    }
  }

  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>.
   *
   * Does nothing when k == k0.  Otherwise dispatches on the second-to-last
   * character, b[k-1].  If none of the suffixes listed for that character
   * matches, the method returns without changes.  If one matches, control
   * falls out of the switch and the word end k is moved to j, but only when
   * m() > 1.
   * NOTE(review): j is presumably set by ends() to the index just before
   * the matched suffix -- confirm against ends().
   */
  private final void step5() {
    if (k == k0) return;
    switch (b[k-1]) {
    case 'a':
      if (ends(NORM_AL)) break;
      return;
    case 'c':
      if (ends("ance")) break;
      if (ends("ence")) break;
      return;
    case 'e':
      if (ends("er")) break; return;
    case 'i':
      if (ends(NORM_IC)) break; return;
    case 'l':
      if (ends("able")) break;
      if (ends("ible")) break; return;
    case 'n':
      /* Order matters: the longer suffixes are tested before the shorter
       * ones they contain. */
      if (ends("ant")) break;
      if (ends("ement")) break;
      if (ends("ment")) break;
      /* element etc. not stripped before the m */
      if (ends("ent")) break;
      return;
    case 'o':
      /* -ion is only accepted when the character before it is 's' or 't'.
       * NOTE(review): the guard compares j against 0, not k0; confirm this
       * is intended when stem(int) is called with a non-zero start index. */
      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
      if (ends("ou")) break; /* takes care of -ous */
      return;
    case 's':
      if (ends("ism")) break;
      return;
    case 't':
      if (ends(NORM_ATE)) break;
      if (ends("iti")) break;
      return;
    case 'u':
      if (ends("ous")) break;
      return;
    case 'v':
      if (ends("ive")) break;
      return;
    case 'z':
      if (ends(NORM_IZE)) break;
      return;
    default:
      return;
    }
    if (m() > 1)
      k = j;
  }

  /* step6() removes a final -e if m() > 1.
   *
   * Sets j = k first.  A trailing 'e' is dropped when m() > 1, or when
   * m() == 1 and cvc(k-1) is false.  Afterwards a trailing 'l' is dropped
   * when doublec(k) holds and m() > 1.
   */
  private final void step6() {
    j = k;
    if (b[k] == 'e') {
      int a = m();
      /* && binds tighter than ||: a > 1, or (a == 1 and not cvc(k-1)). */
      if (a > 1 || a == 1 && !cvc(k-1))
        k--;
    }
    if (b[k] == 'l' && doublec(k) && m() > 1)
      k--;
  }

  /**
   * Stem a word provided as a String.  Returns the result as a String.
   * When stem(char[], int) reports no change, the argument itself is
   * returned; otherwise the value of toString() is returned.
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length()))
      return toString();
    else
      return s;
  }

  /** Stem a word contained in a char[].  Returns true if the stemming process
   * resulted in a word different from the input.  You can retrieve the
   * result with getResultLength()/getResultBuffer() or toString().
   * Delegates to stem(char[], int) with the full array length.
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /** Stem a word contained in a portion of a char[] array.  Returns
   * true if the stemming process resulted in a word different from
   * the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   * Calls reset(), copies wordLen characters starting at wordBuffer[offset]
   * into the internal buffer b, sets i to wordLen and delegates to stem(0).
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    /* Replace the buffer when it is too small; the old contents are not
     * copied because the word is written over them below. */
    if (b.length < wordLen) {
      char[] new_b = new char[wordLen + EXTRA];
      b = new_b;
    }
    /* The loop index j is a local that shadows the class-level j used by
     * the step methods. */
    for (int j=0; j<wordLen; j++)
      b[j] = wordBuffer[offset+j];
    i = wordLen;
    return stem(0);
  }

  /** Stem a word contained in a leading portion of a char[] array.
* Returns true if the stemming process resulted in a word different   * from the input.  You can retrieve the result with   * getResultLength()/getResultBuffer() or toString().     */  public boolean stem(char[] word, int wordLen) {    return stem(word, 0, wordLen);  }  /** Stem the word placed into the Stemmer buffer through calls to add().   * Returns true if the stemming process resulted in a word different   * from the input.  You can retrieve the result with   * getResultLength()/getResultBuffer() or toString().     */  public boolean stem() {    return stem(0);  }  public boolean stem(int i0) {      k = i - 1;     k0 = i0;    if (k > k0+1) {       step1(); step2(); step3(); step4(); step5(); step6();     }    // Also, a word is considered dirty if we lopped off letters    // Thanks to Ifigenia Vairelles for pointing this out.    if (i != k+1)      dirty = true;    i = k+1;    return dirty;  }  /** Test program for demonstrating the Stemmer.  It reads a file and   * stems each word, writing the result to standard out.     
* Usage: Stemmer file-name
   *
   * Every argument is opened as a file and read through a 1024-byte buffer.
   * Runs of letters are lower-cased and collected in the stemmer; each
   * non-letter byte (or end of file) triggers stemming of the collected
   * word, which is printed followed by the non-letter itself.  Bytes are
   * treated as unsigned values (0-255), one character per byte; multi-byte
   * encodings are not decoded.  If a file cannot be opened or read,
   * "error reading <name>" is printed and the next file is processed.
   *
   * @param args names of the files to stem
   */
  public static void main(String[] args) {
    StemmerEN s = new StemmerEN();
    for (int i = 0; i < args.length; i++) {
      try {
        InputStream in = new FileInputStream(args[i]);
        try {
          byte[] buffer = new byte[1024];
          int bufferLen, offset, ch;
          bufferLen = in.read(buffer);
          offset = 0;
          s.reset();
          while (true) {
            if (offset < bufferLen) {
              // Mask to 0..255: a sign-extended byte >= 0x80 would be
              // negative and be mistaken for the end-of-file marker below.
              ch = buffer[offset++] & 0xff;
            } else {
              bufferLen = in.read(buffer);
              offset = 0;
              if (bufferLen < 0) {
                ch = -1; // end of file
              } else {
                ch = buffer[offset++] & 0xff;
              }
            }
            if (Character.isLetter((char) ch)) {
              s.add(Character.toLowerCase((char) ch));
            } else {
              // A non-letter (or end of file) terminates the current word.
              s.stem();
              System.out.print(s.toString());
              s.reset();
              if (ch < 0) {
                break;
              }
              System.out.print((char) ch);
            }
          }
        } finally {
          // Release the file handle even when a read fails part-way.
          in.close();
        }
      } catch (IOException e) {
        System.out.println("error reading " + args[i]);
      }
    }
  }
}
//
//    Jacson - Text Filtering with Java.
//    Copyright (C) 2002 Frank S. Nestel (nestefan -at- users.sourceforge.net)
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//    See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// This file is taken from Apache Lucene which is under the following license:
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -