⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 postinglist.java

📁 Create Posting list for term in documents
💻 JAVA
字号:
/******* PostingList.java ********************************************* 
 Implement PostingList using inverted list, also calcualte df 
 i.e. document frequency and count D i.e total no. of documents
***********************************************************************/
package drsystem;

import java.util.*;
import java.io.*;

class PostingList                   // main class PostingList
{
                                   //hashmap for dictionary posting list
  HashMap<String,Vector<Integer>> hPosList=null;
                                  //hashmap for storing document freq value (df)
  HashMap<String,Integer> hmapDF=null;
  int D=0;                                     // count total no. of documents
  Vector<Integer> noTermsDoc=null;           //count no. of terms in each doc.
  
  public void createPList(String fileIO[]) throws IOException // start of function
  {
    hPosList = new HashMap<String,Vector<Integer>>();
    hmapDF=new HashMap<String,Integer>();
    D=0;
    String str=null;                           //for temp storing each line
    Integer doc_id=0;                          // keep track of doc. id
    noTermsDoc = new Vector<Integer>();     
    
  try
  {
   BufferedReader fin=new BufferedReader(new FileReader("E:/DRS/output_files/"+fileIO[0]));
   String delimiter = ",.:;/-()'[]\\\" "; // used for breaking sentence

   while((str=fin.readLine())!=null)
   {
    if(str.startsWith(".i"))            // check if start of new document
    {
     doc_id=Integer.valueOf(str.substring(3));   // find doc. number
     noTermsDoc.addElement(0);                   // initialize 
     D=D+1;                                     // count no. of documents
     continue;
    }

    StringTokenizer tokenizer=new StringTokenizer(str,delimiter);
    while(tokenizer.hasMoreTokens())
    {
     String token=tokenizer.nextToken();
     if(token.equals(".w"))                     // ignore line containing .w
     continue;
     
     noTermsDoc.set(D-1,noTermsDoc.get(D-1)+1);// increment no. of terms by 1
     if(hPosList.containsKey(token))           // if token already present
     {
      Vector<Integer> vTemp= (Vector<Integer>) hPosList.get(token);
      if(! vTemp.contains(doc_id))    // if doc_id for current token not already present
      {                               // otherwise ignore as needn't to add again
       vTemp.addElement(doc_id);
       hmapDF.put(token,vTemp.size());         //store doc. frequency
      }
     }
     else                                     // if token already not present
     {
      Vector<Integer> vPList=new Vector<Integer>();
      vPList.addElement(doc_id);             // create vector and add doc_id
      hPosList.put(token,vPList);
      hmapDF.put(token,vPList.size());      //store doc. frequency
     }
    }
   }
  }
  catch(FileNotFoundException e)
  {
   System.out.println("PostingList.java:  " + e.getMessage());
  }
  catch(NumberFormatException e)
  {
   System.out.println("PostingList.java:  " + e.getMessage());
  }

 }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -