📄 crawlsample.cs
字号:
using System;
using System.IO;
using System.Text;
using System.Collections;
namespace SECompare.Kernel
{
/// <summary>
/// List of query samples to crawl.
/// Holds an ordered list of <c>CrawlSampleUnit</c> entries that can be loaded from and
/// saved to a text file, searched by query words, and thinned out to a smaller sample.
/// </summary>
public class CrawlSample
{
    private readonly ArrayList mSamples; //An array list of CrawlSampleUnit
    private String mCategory;            //Category passed to the most recent Load call

    /// <summary>
    /// Number of samples currently held.
    /// </summary>
    public int Count
    {
        get
        {
            return this.mSamples.Count;
        }
    }

    /// <summary>
    /// Create an empty sample list.
    /// </summary>
    public CrawlSample()
    {
        this.mSamples = new ArrayList();
    }

    /// <summary>
    /// Load Query samples of a certain category from given file, replacing any samples
    /// that were loaded before.
    /// Each line holds tab-separated fields: the query words followed by the frequency.
    /// File format is like:
    /// ******************************
    /// words	frequency
    /// ******************************
    /// cnn	8700552
    /// cnn.com	2554240
    /// cnn news	786853
    /// www.cnn.com	755488
    /// news	184851
    /// *******************************
    /// Every line of the file produces one sample; no line (blank, header or otherwise)
    /// is skipped. A line without a usable frequency gets frequency 0.
    /// </summary>
    /// <param name="category">Category name remembered for the loaded samples.</param>
    /// <param name="filename">Path of the sample file to read.</param>
    public void Load(String category,String filename)
    {
        this.mCategory = category;
        //firstly, clear former data
        this.mSamples.Clear();
        //then, read data file line by line; 'using' closes the reader even if a line fails
        using (StreamReader reader = new StreamReader(filename))
        {
            String line;
            while ((line = reader.ReadLine()) != null)
            {
                this.mSamples.Add(ParseLine(line));
            }
        }
    }

    /// <summary>
    /// Parse one line of the sample file into a sample unit.
    /// The fields are scanned from the last one back to the second one; the first field
    /// that parses as a 32-bit integer is the frequency. The fields in front of it are
    /// concatenated to form the words. When no field parses (including values too large
    /// for an int), the words are the first field alone and the frequency is 0.
    /// </summary>
    private static CrawlSampleUnit ParseLine(String line)
    {
        String[] fields = line.Split(new char[]{'\t'});

        int frequency = 0;
        int end; //end of words, and start of frequency
        for (end = fields.Length - 1; end > 0; end--)
        {
            //TryParse leaves frequency at 0 on failure, so the "not found" result stays 0
            if (int.TryParse(fields[end], out frequency))
            {
                break;
            }
        }

        //NOTE(review): fields are joined without any separator, and fields located after
        //the frequency field are dropped - kept as-is, confirm this is intended.
        StringBuilder words = new StringBuilder(fields[0]);
        for (int i = 1; i < end; i++)
        {
            words.Append(fields[i]);
        }
        return new CrawlSampleUnit(words.ToString(), frequency);
    }

    /// <summary>
    /// Check if the Words has been contained in the Sample List.
    /// </summary>
    /// <param name="Words">The Words String to check.</param>
    /// <returns>Return true if contains, else false.</returns>
    public bool ContainsWords(String Words)
    {
        foreach (CrawlSampleUnit unit in this.mSamples)
        {
            if (unit.Words.Equals(Words))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Save query samples to file, one sample per line (using the sample's ToString()).
    /// The file is created if it does not exist and overwritten if it does.
    /// </summary>
    /// <param name="filename">File path</param>
    public void Save(String filename)
    {
        //append=false creates the file when missing and truncates it otherwise,
        //so no separate existence check is needed
        using (StreamWriter writer = new StreamWriter(filename, false))
        {
            foreach (CrawlSampleUnit unit in this.mSamples)
            {
                writer.WriteLine(unit.ToString());
            }
        }
    }

    /// <summary>
    /// Remove some samples.
    /// Leave a given [percentage] samples remain, which include samples of high, middle and low frequency.
    /// The target size is Count*percentage truncated to an integer; see Prune(int).
    /// </summary>
    /// <param name="percentage">a float between 0 and 1, else (or if NaN) do nothing</param>
    public void Prune(float percentage)
    {
        if (float.IsNaN(percentage) || percentage > 1 || percentage < 0)
        {
            return;
        }
        if (this.mSamples.Count == 0)
        {
            return;
        }
        this.Prune((int)(this.mSamples.Count * percentage));
    }

    /// <summary>
    /// Remove some samples.
    /// Leave a given [sampleSize] samples remain, which include samples of high, middle and low frequency.
    /// Samples are picked at a fixed interval of Count/sampleSize (integer division)
    /// starting from the first one, so the original order is preserved.
    /// Nothing is removed when that interval is 1 or less, i.e. when sampleSize is more
    /// than half of the current size.
    /// </summary>
    /// <param name="sampleSize">an integer, which represent the sample size after pruning.
    /// If it is larger than original size, or negative, do nothing. If it is 0, all samples are removed.</param>
    public void Prune(int sampleSize)
    {
        int count = this.mSamples.Count;
        if (sampleSize < 0 || count == 0 || count < sampleSize)
        {
            return;
        }
        if (sampleSize == 0)
        {
            //keeping zero samples; handled here so the interval division below cannot divide by zero
            this.mSamples.Clear();
            return;
        }

        int interval = count / sampleSize;
        if (interval <= 1)
        {
            return;
        }

        //Compact in place: move every interval-th sample to the front, then cut the tail.
        //Since interval >= 2 the write position never passes the read position, and the
        //same ArrayList instance (possibly handed out by GetSamples) is kept.
        int kept = 0;
        for (int source = 0; source < count && kept < sampleSize; source += interval)
        {
            this.mSamples[kept] = this.mSamples[source];
            kept++;
        }
        this.mSamples.RemoveRange(kept, count - kept);
    }

    /// <summary>
    /// Get the underlying sample list.
    /// </summary>
    /// <returns>The internal list of CrawlSampleUnit itself (not a copy).</returns>
    public ArrayList GetSamples()
    {
        return this.mSamples;
    }

    /// <summary>
    /// Get the query words of all samples, in list order.
    /// </summary>
    /// <returns>A new array holding the Words of every sample.</returns>
    public String[] GetQueries()
    {
        String[] queries = new String[this.mSamples.Count];
        int next = 0;
        foreach (CrawlSampleUnit unit in this.mSamples)
        {
            queries[next] = unit.Words;
            next++;
        }
        return queries;
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -