// File: fileparser.cs
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using WordNetParser.Objects;
namespace WordNetParser.Helpers
{
internal class FileParser
{
#region ParseIndex
/// <summary>
/// Builds an <see cref="Index"/> from one whitespace-tokenized index line.
/// </summary>
/// <param name="offset">Offset stored in <c>IdxOffset</c>; also the position passed to <see cref="ReadIndex"/> when <paramref name="data"/> is null or empty.</param>
/// <param name="dbFileName">File the line is read from when <paramref name="data"/> is null or empty.</param>
/// <param name="data">The line to parse; when null or empty the line is read from the file instead.</param>
/// <returns>
/// The populated index, or an index holding only default values (zero counts, empty strings,
/// empty lists) when no line text is available.
/// </returns>
internal static Index ParseIndex( long offset, string dbFileName, string data )
{
    // Start from neutral defaults so that callers always receive usable lists and strings.
    Index parsed = new Index();
    parsed.IdxOffset = 0;
    parsed.OffsetCount = 0;
    parsed.PartOfSpech = string.Empty;
    parsed.PointersUsed = new List<int>();
    parsed.PointersUsedCount = 0;
    parsed.SenseCount = 0;
    parsed.SynSetsOffsets = new List<long>();
    parsed.TaggedSensesCount = 0;
    parsed.Word = string.Empty;

    string line = data;
    if( string.IsNullOrEmpty( line ) )
    {
        line = ReadIndex( offset, dbFileName );
    }
    if( string.IsNullOrEmpty( line ) )
    {
        return parsed;
    }

    string[] fields = line.Split( Constants.Tokenizer, StringSplitOptions.RemoveEmptyEntries );
    int cursor = 0;

    // Fixed leading fields: word, part of speech, sense count, pointer-symbol count.
    parsed.IdxOffset = offset;
    parsed.Word = fields[ cursor++ ];
    parsed.PartOfSpech = fields[ cursor++ ];
    parsed.SenseCount = Convert.ToInt32( fields[ cursor++ ] );
    parsed.PointersUsedCount = Convert.ToInt32( fields[ cursor++ ] );

    // One pointer symbol per counted entry, stored as its index in the pointer-type table.
    int pointersEnd = cursor + parsed.PointersUsedCount;
    while( cursor < pointersEnd )
    {
        parsed.PointersUsed.Add( GetPointerTypeIndex( fields[ cursor ] ) );
        cursor++;
    }

    parsed.OffsetCount = Convert.ToInt32( fields[ cursor++ ] );
    parsed.TaggedSensesCount = Convert.ToInt32( fields[ cursor++ ] );

    // The remaining counted fields are the synset offsets.
    int offsetsEnd = cursor + parsed.OffsetCount;
    while( cursor < offsetsEnd )
    {
        parsed.SynSetsOffsets.Add( Convert.ToInt64( fields[ cursor ] ) );
        cursor++;
    }

    return parsed;
}
#endregion ParseIndex
#region ParseDefinition
/// <summary>
/// Reads the data-file line at <paramref name="offset"/> and parses it into a <see cref="Definition"/>.
/// </summary>
/// <param name="offset">Byte offset of the line in <paramref name="dbFileName"/>; must equal the offset stored as the line's first field.</param>
/// <param name="dbFileName">Path of the data file; its extension selects the adjective- and verb-specific handling.</param>
/// <param name="word">Word whose position in the synset is recorded in <c>WhichWord</c>; may be null (then no word matches).</param>
/// <returns>The parsed definition, or a freshly constructed <see cref="Definition"/> when no line could be read.</returns>
/// <exception cref="ArithmeticException">The offset stored in the line differs from <paramref name="offset"/>.</exception>
/// <remarks>
/// Per the WordNet data-file format, the word count, the two halves of the pointer
/// source/target field and the verb-frame word number are hexadecimal; the other counts are decimal.
/// </remarks>
internal static Definition ParseDefinition( long offset, string dbFileName, string word )
{
    Definition retVal = new Definition();
    string data = ReadIndex( offset, dbFileName );
    if( string.IsNullOrEmpty( data ) )
    {
        return retVal;
    }

    string[] tokens = data.Split( Constants.Tokenizer, StringSplitOptions.RemoveEmptyEntries );
    int i = 0;

    retVal.Position = Convert.ToInt64( tokens[ i++ ] );
    if( retVal.Position != offset )
    {
        throw new ArithmeticException( "The stream position is not aligned with the specified offset!" );
    }
    retVal.FileNumber = Convert.ToInt32( tokens[ i++ ] );
    retVal.PartOfSpeech = tokens[ i++ ];
    if( GetSynSetTypeCode( retVal.PartOfSpeech ) == DbPartOfSpechType.Satellite )
    {
        retVal.DefinitionType = Constants.INDIRECT_ANT;
    }

    // The word count is a hexadecimal field; a decimal parse mis-reads "10" and fails on "0a".
    retVal.WordCount = Convert.ToInt32( tokens[ i++ ], 16 );

    // Words come as (word, lex_id) pairs, hence the step of two.
    for( int j = 0; j < retVal.WordCount * 2; j += 2 )
    {
        string tempWord = tokens[ i + j ];
        if( !string.IsNullOrEmpty( tempWord ) )
        {
            retVal.Words.Add( DecodeWord( tempWord ) );
        }
        // Null-safe, culture-independent comparison of the raw token with the requested word.
        if( string.Equals( tempWord, word, StringComparison.OrdinalIgnoreCase ) )
        {
            // NOTE(review): this stores the token index rather than the ordinal of the word
            // within the synset; kept unchanged because callers may rely on it - confirm.
            retVal.WhichWord = ( i + j );
        }
    }
    i += retVal.WordCount * 2;

    retVal.PtrCount = Convert.ToInt32( tokens[ i++ ] );

    // The file type cannot change while parsing one line, so evaluate it once.
    bool isAdjectiveFile = AssertDatabaseType( dbFileName, DbPartOfSpechType.Adj );
    bool foundPert = false;

    // Each pointer occupies four tokens: symbol, synset offset, part of speech, source/target.
    int pointersEnd = i + ( retVal.PtrCount * 4 );
    for( int j = i; j < pointersEnd; j += 4 )
    {
        int pointerIndex = GetPointerTypeIndex( tokens[ j ] );
        long pointerOffset = Convert.ToInt64( tokens[ j + 1 ] );
        int pointerPartOfSpeech = GetPartOfSpeech( Convert.ToChar( tokens[ j + 2 ] ) );

        // Source/target is four hex digits: the first two are the source word number and
        // the last two the target word number.
        string lexToFrom = tokens[ j + 3 ];
        int lexFrom = Convert.ToInt32( lexToFrom.Substring( 0, 2 ), 16 );
        int lexTo = Convert.ToInt32( lexToFrom.Substring( 2, 2 ), 16 );

        retVal.PtrTypes.Add( pointerIndex );
        retVal.PtrOffsets.Add( pointerOffset );
        retVal.PtrPartOfSpeech.Add( pointerPartOfSpeech );
        retVal.PtrFromFields.Add( lexFrom );
        retVal.PtrToFields.Add( lexTo );

        if( isAdjectiveFile && retVal.DefinitionType == Constants.DONT_KNOW )
        {
            if( pointerIndex == Constants.PointerTypeContants.ANTPTR )
            {
                retVal.DefinitionType = Constants.DIRECT_ANT;
            }
            else if( pointerIndex == Constants.PointerTypeContants.PERTPTR )
            {
                foundPert = true;
            }
        }
    }
    i = pointersEnd;

    // An adjective that is still unclassified but has a pertainym pointer is a pertainym.
    if( isAdjectiveFile && retVal.DefinitionType == Constants.DONT_KNOW && foundPert )
    {
        retVal.DefinitionType = Constants.PERTAINY;
    }

    if( AssertDatabaseType( dbFileName, DbPartOfSpechType.Verb ) )
    {
        retVal.VerbFrameCount = Convert.ToInt32( tokens[ i++ ] );

        // Each frame occupies three tokens: a marker, the frame number (decimal) and the
        // word number it applies to (hexadecimal).
        int framesEnd = i + ( retVal.VerbFrameCount * 3 );
        for( int j = i; j < framesEnd; j += 3 )
        {
            int frameId = Convert.ToInt32( tokens[ j + 1 ] );
            int frameTo = Convert.ToInt32( tokens[ j + 2 ], 16 );
            retVal.FrameIds.Add( frameId );
            retVal.FrameToFields.Add( frameTo );
        }
        i = framesEnd;
    }

    // Skip the token that separates the structured fields from the definition text.
    i++;
    retVal.DefinitionText = i < tokens.Length
        ? string.Join( " ", tokens, i, tokens.Length - i )
        : string.Empty;

    retVal.SenseNumbers = new List<int>( new int[ retVal.WordCount ] );
    for( int j = 0; j < retVal.WordCount; j++ )
    {
        retVal.SenseNumbers[ j ] = GetSearchSense( retVal, j );
    }

    return retVal;
}
#endregion ParseDefinition
#region GetSearchSense
/// <summary>
/// Finds the 1-based sense number of the synset <paramref name="def"/> for one of its words,
/// by looking the word up in the index file for the synset's part of speech.
/// </summary>
/// <param name="def">Definition whose <c>Position</c> is searched for among the word's synset offsets.</param>
/// <param name="whichWord">Zero-based position in <c>def.Words</c> of the word to look up.</param>
/// <returns>
/// The 1-based position of <c>def.Position</c> in the word's synset-offset list, or 0 when the
/// word is not found in the index or none of its offsets matches.
/// </returns>
private static int GetSearchSense( Definition def, int whichWord )
{
    DbPartOfSpechType indexType = GetSynSetTypeCode( def.PartOfSpeech );
    string dbFileName = DbFileHelper.GetIndexForType( indexType )[ 0 ];

    // def.Words holds decoded words (spaces, original case), whereas index lemmas are
    // lower-case with underscores; normalise before searching or the lookup misses.
    string lemma = EncodeWord( def.Words[ whichWord ] ).ToLowerInvariant();

    long offset = FastSearch( lemma, dbFileName );
    if( offset == 0L )
    {
        // FastSearch reports "not found" as 0; parsing the line at offset 0 would read
        // unrelated text instead of an index entry.
        return 0;
    }

    Index idx = ParseIndex( offset, dbFileName, string.Empty );
    for( int i = 0; i < idx.OffsetCount; i++ )
    {
        if( idx.SynSetsOffsets[ i ] == def.Position )
        {
            return i + 1;
        }
    }
    return 0;
}
#endregion GetSearchSense
#region ReadIndex
/// <summary>
/// Reads a single line of text from a file, starting at a byte offset.
/// </summary>
/// <param name="offset">Byte position, from the start of the file, at which reading begins.</param>
/// <param name="dbFileName">Path of the file to read.</param>
/// <returns>The text from <paramref name="offset"/> up to the next line break, or null when nothing is left to read.</returns>
internal static string ReadIndex( long offset, string dbFileName )
{
    using( StreamReader lineReader = new StreamReader( dbFileName, true ) )
    {
        // Position the underlying stream before the first read, while the reader's buffer is still empty.
        lineReader.BaseStream.Seek( offset, SeekOrigin.Begin );
        return lineReader.ReadLine();
    }
}
#endregion ReadIndex
#region FastSearch
/// <summary>
/// Binary-searches a line-oriented file for the line whose first token equals
/// <paramref name="keyword"/>.
/// </summary>
/// <param name="keyword">Key to find; compared ordinally (case-sensitively) with the first token of each probed line.</param>
/// <param name="dbFileName">Path of the file to search. NOTE(review): the search assumes lines are sorted by key in ordinal order - confirm for the files in use.</param>
/// <returns>
/// The byte offset at which the matching line starts, or 0 when no probed line matches or the
/// file is empty.
/// </returns>
/// <remarks>
/// 0 is also the offset of the first line, so a match there cannot be told apart from
/// "not found" by the return value alone.
/// </remarks>
internal static long FastSearch( string keyword, string dbFileName )
{
    Encoding enc;
    using( StreamReader reader = new StreamReader( dbFileName, true ) )
    {
        // CurrentEncoding reflects a byte-order mark only after the first read, so force one.
        reader.Peek();
        enc = reader.CurrentEncoding;
    }

    using( FileStream fs = File.OpenRead( dbFileName ) )
    {
        long top = 0;
        long bottom = fs.Length;
        if( bottom == 0 )
        {
            return 0L;
        }

        byte[] btData = new byte[ Constants.KEY_LEN ];
        long mid = ( bottom - top ) / 2;
        long diff;
        do
        {
            if( mid <= 1 )
            {
                // Too close to the start to scan for a preceding line break: probe the first line.
                fs.Seek( 0, SeekOrigin.Begin );
            }
            else
            {
                // Start one byte early so that a line beginning exactly at mid is recognised,
                // then skip forward to just past the next line break (or to end of file).
                fs.Seek( mid - 1, SeekOrigin.Begin );
                int current;
                do
                {
                    current = fs.ReadByte();
                }
                while( current != '\n' && current != -1 );
            }

            // Taken after the scan on every probe, so it can never be left over from an earlier one.
            long lineStart = fs.Position;

            // Decode only the bytes actually read; the buffer tail may hold stale data.
            int count = fs.Read( btData, 0, btData.Length );
            string key = enc.GetString( btData, 0, count ).Split( Constants.Tokenizer )[ 0 ];

            // Ordinal comparison: culture-aware ordering treats punctuation such as '-' and '_'
            // specially and would steer the search the wrong way in a byte-ordered file.
            int order = string.CompareOrdinal( key, keyword );
            if( order == 0 )
            {
                return lineStart;
            }
            if( order < 0 )
            {
                top = mid;
            }
            else
            {
                bottom = mid;
            }
            diff = ( bottom - top ) / 2;
            mid = top + diff;
        }
        while( diff != 0 );
    }

    return 0L;
}
#endregion FastSearch
#region AssertDatabaseType
/// <summary>
/// Tells whether a database file's extension names the given part-of-speech type.
/// </summary>
/// <param name="dbFileName">Path of the database file; only its extension is examined.</param>
/// <param name="type">Type whose name is compared, ignoring case, with the extension (without the dot).</param>
/// <returns>True when the extension equals the type's name; false otherwise, including when the path has no extension.</returns>
private static bool AssertDatabaseType( string dbFileName, DbPartOfSpechType type )
{
    string extension = Path.GetExtension( dbFileName );
    if( string.IsNullOrEmpty( extension ) )
    {
        // No extension (or a null path): nothing to compare, and Substring( 1 ) would throw.
        return false;
    }

    // Drop the leading dot and compare without depending on the current culture.
    return string.Equals( extension.Substring( 1 ), type.ToString(), StringComparison.OrdinalIgnoreCase );
}
#endregion AssertDatabaseType
#region GetPointerTypeIndex
/// <summary>
/// Looks up a pointer symbol in <c>Constants.PointerTypes</c>.
/// </summary>
/// <param name="pointerMark">Pointer symbol to look up; surrounding whitespace and letter case are ignored.</param>
/// <returns>The index of the first matching entry, or -1 when there is none or <paramref name="pointerMark"/> is null.</returns>
internal static int GetPointerTypeIndex( string pointerMark )
{
    if( pointerMark == null )
    {
        return -1;
    }

    // Normalise the search key once instead of on every iteration.
    string mark = pointerMark.Trim();
    for( int i = 0; i < Constants.PointerTypes.Length; i++ )
    {
        string candidate = Constants.PointerTypes[ i ];
        if( candidate != null &&
            string.Equals( candidate.Trim(), mark, StringComparison.OrdinalIgnoreCase ) )
        {
            return i;
        }
    }
    return -1;
}
#endregion GetPointerTypeIndex
#region GetSynSetTypeCode
/// <summary>
/// Maps the first character of a part-of-speech string to its <see cref="DbPartOfSpechType"/>.
/// </summary>
/// <param name="data">Part-of-speech text; only its first character is inspected, so it must not be null or empty.</param>
/// <returns>
/// Noun for 'n', Adj for 'a', Verb for 'v', Satellite for 's', Adv for 'r';
/// <c>DbPartOfSpechType.All</c> for any other character.
/// </returns>
internal static DbPartOfSpechType GetSynSetTypeCode( string data )
{
    char marker = data[ 0 ];

    if( marker == 'n' )
    {
        return DbPartOfSpechType.Noun;
    }
    if( marker == 'a' )
    {
        return DbPartOfSpechType.Adj;
    }
    if( marker == 'v' )
    {
        return DbPartOfSpechType.Verb;
    }
    if( marker == 's' )
    {
        return DbPartOfSpechType.Satellite;
    }
    if( marker == 'r' )
    {
        return DbPartOfSpechType.Adv;
    }

    // Unrecognised marker.
    return DbPartOfSpechType.All;
}
#endregion GetSynSetTypeCode
#region GetPartOfSpeech
/// <summary>
/// Maps a part-of-speech character to the matching <c>Constants.POS_*</c> value.
/// </summary>
/// <param name="data">Part-of-speech character: 'n', 'a', 's', 'v' or 'r'.</param>
/// <returns>
/// <c>POS_NOUN</c> for 'n', <c>POS_ADJ</c> for 'a' and 's', <c>POS_VERB</c> for 'v',
/// <c>POS_ADV</c> for 'r'; -1 for any other character.
/// </returns>
internal static int GetPartOfSpeech( char data )
{
    if( data == 'n' )
    {
        return Constants.POS_NOUN;
    }
    // 'a' and 's' both map to the adjective value.
    if( data == 'a' || data == 's' )
    {
        return Constants.POS_ADJ;
    }
    if( data == 'v' )
    {
        return Constants.POS_VERB;
    }
    if( data == 'r' )
    {
        return Constants.POS_ADV;
    }
    return -1;
}
#endregion GetPartOfSpeech
#region EncodeWord
/// <summary>
/// Replaces every space in a word with an underscore.
/// </summary>
/// <param name="data">Text to convert; must not be null.</param>
/// <returns>A copy of <paramref name="data"/> with each ' ' replaced by '_'.</returns>
private static string EncodeWord( string data )
{
    const char space = ' ';
    const char underscore = '_';
    return data.Replace( space, underscore );
}
#endregion EncodeWord
#region DecodeWord
/// <summary>
/// Replaces every underscore in a word with a space.
/// </summary>
/// <param name="data">Text to convert; must not be null.</param>
/// <returns>A copy of <paramref name="data"/> with each '_' replaced by ' '.</returns>
private static string DecodeWord( string data )
{
    const char underscore = '_';
    const char space = ' ';
    return data.Replace( underscore, space );
}
#endregion DecodeWord
}
}
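// Keyboard shortcut help (apparently residue from a code-viewer page, not part of the program):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -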