⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 前期开发时开发的新闻发布系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */package org.apache.lucene.demo.html;import java.io.*;import java.util.Properties;public class HTMLParser implements HTMLParserConstants {	  public static int SUMMARY_LENGTH = 200;  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);  Properties metaTags = new Properties();  String currentMetaTag = null;  String currentMetaContent = null;  int length = 0;  boolean titleComplete = false;  boolean inTitle = false;  boolean inMetaTag = false;  boolean inStyle = false;  boolean afterTag = false;  boolean afterSpace = false;  String eol = System.getProperty("line.separator");  Reader pipeIn = null;  Writer pipeOut;  private MyPipedInputStream pipeInStream = null;  private PipedOutputStream pipeOutStream = null;  private class MyPipedInputStream extends PipedInputStream  {    public MyPipedInputStream()    {      super();    }    public MyPipedInputStream(PipedOutputStream src) throws IOException    {      super(src);    }    public boolean full() throws IOException    {      return this.available() >= PipedInputStream.PIPE_SIZE;    }  }  public HTMLParser(File file) throws FileNotFoundException   {    this(new FileInputStream(file));  }  public String getTitle() throws IOException, InterruptedException   {    if(pipeIn == null)    {    	    	getReader();// spawn parsing thread    }    while(true)     {    	      synchronized(this)       {        if(titleComplete || pipeInStream.full())        {        	        	break;        }        wait(10);      }    }    return title.toString().trim();  }  public Properties getMetaTags() throws IOException, InterruptedException   {    if(pipeIn == null)    {    	    	getReader();// spawn parsing thread    }    while(true)     {      synchronized(this)       {        if(titleComplete || pipeInStream.full())        {        	        	break;        }        wait(10);      }    }    return metaTags;  }  public String getSummary() throws IOException, InterruptedException   {    if(pipeIn == null)    {    	    	getReader();// spawn parsing thread    }    while(true)     {      synchronized(this)       {        if(summary.length() >= SUMMARY_LENGTH || pipeInStream.full())        {        	        	break;        }        wait(10);      }    }    if(summary.length() > SUMMARY_LENGTH)    {    	    	summary.setLength(SUMMARY_LENGTH);    }    String sum = summary.toString().trim();    String tit = getTitle();    if(sum.startsWith(tit) || sum.equals(""))    {    	    	return tit;    }    else    {    	    	return sum;    }  }  public Reader getReader() throws IOException   {    if(pipeIn == null)     {      pipeInStream = new MyPipedInputStream();      pipeOutStream = new PipedOutputStream(pipeInStream);      pipeIn = new InputStreamReader(pipeInStream);      pipeOut = new OutputStreamWriter(pipeOutStream);      Thread thread = new ParserThread(this);      thread.start();// start parsing    }    return pipeIn;  }  void addToSummary(String text)   {    if(summary.length() < SUMMARY_LENGTH)     {      summary.append(text);      if(summary.length() >= SUMMARY_LENGTH)       {        synchronized(this)         {          notifyAll();        }      }    }  }  void addText(String text) throws IOException   {    if(inStyle)    {    	    	return;    }    if(inTitle)    {    	    	title.append(text);    }    else     {      addToSummary(text);      if(!titleComplete && !title.equals(""))// finished title       {          synchronized(this)         {          titleComplete = true;// tell waiting threads          notifyAll();        }      }    }    length += text.length();    pipeOut.write(text);    afterSpace = false;  }  void addMetaTag() throws IOException   {      metaTags.setProperty(currentMetaTag, currentMetaContent);      currentMetaTag = null;      currentMetaContent = null;      return;  }  void addSpace() throws IOException   {    if(!afterSpace)     {      if(inTitle)      {    	  title.append(" ");    	        }      else      {    	      	  addToSummary(" ");      }      String space = afterTag ? eol : " ";      length += space.length();      pipeOut.write(space);      afterSpace = true;    }  }  final public void HTMLDocument() throws ParseException, IOException   {	  Token t;	    label_1:	    while(true) 	    {	      switch((jj_ntk == -1) ? jj_ntk() : jj_ntk) 	      {	      	case ScriptStart:	      	case TagName:	      	case DeclName:	      	case Comment1:	      	case Comment2:	      	case Word:	      	case Entity:	      	case Space:	      	case Punct:	      				;	      				break;	      	default:	      				jj_la1[0] = jj_gen;	      				break label_1;	      }	      switch((jj_ntk == -1) ? jj_ntk() : jj_ntk) 	      {	      	case TagName:	      					Tag();	      					afterTag = true;	      					break;	      	case DeclName:	      					t = Decl();	      					afterTag = true;	      					break;	      	case Comment1:	      	case Comment2:	      					CommentTag();	      					afterTag = true;	      					break;	      	case ScriptStart:	      					ScriptTag();	      					afterTag = true;	      					break;	      	case Word:	      					t = jj_consume_token(Word);	      					addText(t.image);	      					afterTag = false;	      					break;	      	case Entity:	      					t = jj_consume_token(Entity);	      					addText(Entities.decode(t.image)); 	      					afterTag = false;	      					break;	      	case Punct:	      					t = jj_consume_token(Punct);	      					addText(t.image); 	      					afterTag = false;	      					break;	      	case Space:	      					jj_consume_token(Space);	      					addSpace(); 	      					afterTag = false;	      					break;	      	default:	      					jj_la1[1] = jj_gen;	      					jj_consume_token(-1);	      					throw new ParseException();	      }	    }	    jj_consume_token(0);  }  final public void Tag() throws ParseException, IOException   {	  Token t1, t2;	  boolean inImg = false;	  t1 = jj_consume_token(TagName);	  String tagName = t1.image.toLowerCase();	  if(Tags.WS_ELEMS.contains(tagName) ) 	  {		  addSpace();	  }	  inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>	  inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>	  inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>	  inImg = tagName.equalsIgnoreCase("<img");     // keep track if in <IMG>	  label_2:	  while(true) 	  {		  switch((jj_ntk == -1) ? jj_ntk() : jj_ntk) 		  {		  	case ArgName:		  					;		  					break;		  	default:		  					jj_la1[2] = jj_gen;		  					break label_2;		  }		  t1 = jj_consume_token(ArgName);		  switch((jj_ntk == -1) ? jj_ntk() : jj_ntk) 		  {		  	 case ArgEquals:		  					jj_consume_token(ArgEquals);		  					switch((jj_ntk == -1) ? jj_ntk() : jj_ntk) 		  					{		  						case ArgValue:		  						case ArgQuote1:		  						case ArgQuote2:		  										t2 = ArgValue();		  										if(inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)		  										{		  													  											addText("[" + t2.image + "]");		  										}		  										if(inMetaTag && (t1.image.equalsIgnoreCase("name") || t1.image.equalsIgnoreCase("HTTP-EQUIV")) && t2 != null)										        {										                currentMetaTag=t2.image.toLowerCase();										                if(currentMetaTag != null && currentMetaContent != null) 										                {										                	addMetaTag();										                }										        }		  										if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null)										        {										                currentMetaContent=t2.image.toLowerCase();										                if(currentMetaTag != null && currentMetaContent != null) 										                {										                	addMetaTag();										                }										        }										        break;		  					 default:		  						 				jj_la1[3] = jj_gen;		  					 					;		  					}		  	break;		  	default:		  				jj_la1[4] = jj_gen;		  				;		  }	  }	  jj_consume_token(TagEnd);  }  final public Token ArgValue() throws ParseException   {  Token t = null;    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case ArgValue:      t = jj_consume_token(ArgValue);                                              {if (true) return t;}      break;    default:      jj_la1[5] = jj_gen;      if (jj_2_1(2)) {        jj_consume_token(ArgQuote1);        jj_consume_token(CloseQuote1);                                              {if (true) return t;}      } else {        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case ArgQuote1:          jj_consume_token(ArgQuote1);          t = jj_consume_token(Quote1Text);          jj_consume_token(CloseQuote1);                                              {if (true) return t;}          break;        default:          jj_la1[6] = jj_gen;          if (jj_2_2(2)) {            jj_consume_token(ArgQuote2);            jj_consume_token(CloseQuote2);                                              {if (true) return t;}          } else {            switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {            case ArgQuote2:              jj_consume_token(ArgQuote2);              t = jj_consume_token(Quote2Text);              jj_consume_token(CloseQuote2);                                              {if (true) return t;}              break;            default:              jj_la1[7] = jj_gen;              jj_consume_token(-1);              throw new ParseException();            }          }        }      }    }    throw new Error("Missing return statement in function");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -