📄 xpathcounter.java
字号:
/**
* Collects all XPaths found in the XML files of a directory and encodes each
* file as a bit set over those XPaths.
* @author Michal Karolczak
*/
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Command-line entry point: parses every XML file returned by
 * {@code GetFilesList}, assigns a global number to each distinct
 * "XPath: count" string, records the matching bits per file, writes the
 * collected XPaths to a text file and finally asks
 * {@code BitEncodedFilesList} to save itself and build a distance matrix.
 */
public class XPathCounter
{
    /** Output file that receives one "xpath number" line per distinct XPath. */
    private static final String OUTPUT_FILE = "C:\\out.txt";

    /** Initial capacity of the global XPath table (133324 entries in total were observed). */
    private static final int INITIAL_CAPACITY = 200000;

    /** Milliseconds per minute, used for the elapsed-time report. */
    private static final long MILLIS_PER_MINUTE = 60000L;

    /**
     * Runs the whole pipeline: encode files, dump XPaths, save the encoded
     * list, create the distance matrix.
     *
     * @param argv command-line arguments (unused)
     * @throws IOException if the XPath output file cannot be written
     */
    public static void main(String argv[]) throws IOException
    {
        System.out.println("App started");
        long start = System.currentTimeMillis();

        System.out.println("Gathering information about files...");
        // List of all files with the "xml" extension in the input directory.
        GetFilesList filesList = new GetFilesList("C:\\smalldoc","xml");//xmlDirPath,documents,smalldoc);
        // Collects one BitEncodedFileInfo (file name + bit set) per processed file.
        BitEncodedFilesList bitEncodedFilesList = new BitEncodedFilesList();

        // All distinct XPaths in first-seen order. Keyed by the full
        // "xpath: count" string rather than by its hashCode(): different
        // strings may share a hash code and would otherwise be merged.
        Map<String, XPathInfo> allXPathsTable = new LinkedHashMap<String, XPathInfo>(INITIAL_CAPACITY);
        // Next free global XPath number (also the bit index used per file).
        int xPathNr = 0;

        // The factory is reusable, so it is created once for all files.
        SAXParserFactory factory = SAXParserFactory.newInstance();

        System.out.println("Encoding...");
        int i;
        // NOTE(review): the previous bound was size()-1, which skipped the
        // last listed file; confirm GetFilesList adds no trailing sentinel.
        for (i = 0; i < filesList.listOfFiles.size(); i++)
        {
            String fileName = filesList.listOfFiles.get(i).toString();
            // Holds the file name and the bit set for the current file (75047 files in the full corpus).
            BitEncodedFileInfo bitEncodedFileInfo = new BitEncodedFileInfo(fileName);
            try
            {
                // Counts every XPath occurring in the current document.
                XPathCounterSaxHandler handler = new XPathCounterSaxHandler();
                SAXParser saxParser = factory.newSAXParser();
                saxParser.parse(new File(filesList.listOfFiles.get(i)), handler);

                Map<String, Integer> xPathCounts = handler.getXPathCounts();
                if (xPathCounts.isEmpty())
                {
                    System.out.println(fileName + " contains no XPaths or it's not well formed.");
                }
                else
                {
                    // Sorted iteration keeps the numbering independent of document order.
                    for (String xPath : new TreeSet<String>(xPathCounts.keySet()))
                    {
                        String xPathWithCount = xPath + ": " + xPathCounts.get(xPath);
                        XPathInfo known = allXPathsTable.get(xPathWithCount);
                        if (known == null)
                        {
                            // First occurrence: register it under the next global number.
                            allXPathsTable.put(xPathWithCount, new XPathInfo(xPathWithCount, xPathNr));
                            bitEncodedFileInfo.SetBit(xPathNr);
                            xPathNr++;
                        }
                        else
                        {
                            bitEncodedFileInfo.SetBit(known.xPathNr);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: report the broken file and continue with the next one.
                // JVM errors (e.g. OutOfMemoryError) are deliberately not caught.
                System.out.println(fileName + " contains no XPaths or is not well formed");
                e.printStackTrace();
            }
            // The file is recorded even when parsing failed (its bit set then stays as created).
            bitEncodedFilesList.AddNewFileInfo(bitEncodedFileInfo);
        }
        System.out.println("Encoding finished!");
        System.out.println("Nr of files: " + i);
        System.out.println("Nr of XPaths: " + xPathNr);
        printElapsed(start);

        // Dump every distinct XPath with its global number, in insertion order.
        Writer output = new BufferedWriter(new FileWriter(new File(OUTPUT_FILE)));
        try
        {
            for (XPathInfo info : allXPathsTable.values())
            {
                output.append(info.xPath + " " + info.xPathNr + "\n");
            }
        }
        finally
        {
            // Always release the file handle, even if a write fails.
            output.close();
        }

        // Saves encoded files list to file
        bitEncodedFilesList.SaveToFile();

        start = System.currentTimeMillis();
        System.out.println("Creating distance matrix...");
        bitEncodedFilesList.CreateDistanceMatrix();
        System.out.println("Distance matrix created!");
        printElapsed(start);
    } // main.

    /** Prints the time elapsed since {@code startMillis} in milliseconds and whole minutes. */
    private static void printElapsed(long startMillis)
    {
        long time = System.currentTimeMillis() - startMillis;
        System.out.println("Time [ms]: " + time);
        System.out.println("Time [min]: " + (time / MILLIS_PER_MINUTE));
    }
} // XPathCounter
/**
 * SAX handler that counts how often each element and attribute path occurs
 * in one XML document.
 *
 * Keys of the resulting map have the form {@code <path>:<depth>}, where the
 * path is built from element names joined by "/" (attributes are appended as
 * {@code /@name}) and the depth is the element nesting level (attributes use
 * the owning element's depth plus one).
 */
class XPathCounterSaxHandler extends DefaultHandler
{
    /** Current element nesting depth; 0 before the root element is opened. */
    int deep=0;

    /** All paths found in the document and their counts, in first-seen order. */
    private final LinkedHashMap<String, Integer> xPathCounts = new LinkedHashMap<String, Integer>();

    /** Paths of the currently open elements; the head is the innermost element. */
    private final Deque<String> xPaths = new ArrayDeque<String>();

    /** Creates a handler with empty counts. */
    public XPathCounterSaxHandler()
    {
    }

    /**
     * Returns the live map of path keys to occurrence counts.
     *
     * @return the internal map (not a copy)
     */
    public LinkedHashMap<String, Integer> getXPathCounts()
    {
        return this.xPathCounts;
    } // getXPathCounts

    /**
     * Records the opened element and each of its attributes.
     *
     * @param namespaceURI namespace URI of the element (unused)
     * @param lName local name; when null or empty the qualified name is used
     * @param qName qualified name of the element
     * @param attrs attributes of the element, may be null
     * @throws SAXException declared by the SAX contract; not thrown here
     */
    @Override
    public void startElement(String namespaceURI,String lName,String qName,Attributes attrs) throws SAXException
    {
        // Non-namespace-aware parsers report an empty local name.
        String eName = (lName == null || "".equals(lName)) ? qName : lName;

        String parentXPath = xPaths.isEmpty() ? "" : xPaths.peek();
        String currentPath = parentXPath + "/" + eName;
        xPaths.push(currentPath);
        deep++;

        increment(currentPath + ":" + deep);

        if (attrs != null)
        {
            for (int i = 0; i < attrs.getLength(); i++)
            {
                String aName = attrs.getLocalName(i);
                if (aName == null || "".equals(aName)) aName = attrs.getQName(i);
                // Attributes are counted the same way as elements; previously
                // a repeated attribute path was re-stored without adding one.
                increment(currentPath + "/@" + aName + ":" + (deep + 1));
            }
        }
    } // end - startElement

    /**
     * Leaves the current element: drops its path and decreases the depth.
     *
     * @param namespaceURI namespace URI of the element (unused)
     * @param lName local name of the element (unused)
     * @param qName qualified name of the element (unused)
     * @throws SAXException declared by the SAX contract; not thrown here
     */
    @Override
    public void endElement(String namespaceURI, String lName,String qName) throws SAXException
    {
        xPaths.pop();
        deep--;
    } // end - endElement

    /** Adds one to the count stored under {@code key}, starting at 1 for a new key. */
    private void increment(String key)
    {
        Integer previous = xPathCounts.get(key);
        xPathCounts.put(key, previous == null ? 1 : previous + 1);
    }
} // XPathCounterSaxHandler
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -