⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fileparser.cs

📁 http://wordnet.princeton.edu/ WordNet is a large lexical database of English, developed under the d
💻 CS
字号:
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;

using WordNetParser.Objects;

namespace WordNetParser.Helpers
{
	internal class FileParser
	{
		#region ParseIndex
		/// <summary>
		/// Builds an <see cref="Index"/> from one tokenized index line.
		/// </summary>
		/// <param name="offset">Position of the line inside <paramref name="dbFileName"/>; stored as <c>IdxOffset</c> when a line is parsed.</param>
		/// <param name="dbFileName">File to read the line from when <paramref name="data"/> is null or empty.</param>
		/// <param name="data">The raw line, if the caller already has it; otherwise null or empty.</param>
		/// <returns>
		/// The populated entry, or an entry holding only default values when no line text is available.
		/// </returns>
		/// <remarks>
		/// Fields are consumed in this order: word, part of speech, sense count, pointer count,
		/// that many pointer symbols, offset count, tagged-sense count, that many synset offsets.
		/// </remarks>
		internal static Index ParseIndex( long offset, string dbFileName, string data )
		{
			// Start from explicit defaults so callers always receive usable lists.
			Index retVal = new Index();
			retVal.IdxOffset = 0;
			retVal.OffsetCount = 0;
			retVal.PartOfSpech = string.Empty;
			retVal.PointersUsed = new List<int>();
			retVal.PointersUsedCount = 0;
			retVal.SenseCount = 0;
			retVal.SynSetsOffsets = new List<long>();
			retVal.TaggedSensesCount = 0;
			retVal.Word = string.Empty;

			// Fall back to the file only when the caller did not hand over the line.
			string line = data;
			if( string.IsNullOrEmpty( line ) )
				line = ReadIndex( offset, dbFileName );

			if( string.IsNullOrEmpty( line ) )
				return retVal;

			string[] fields = line.Split( Constants.Tokenizer, StringSplitOptions.RemoveEmptyEntries );
			int cursor = 0;

			retVal.IdxOffset = offset;
			retVal.Word = fields[ cursor++ ];
			retVal.PartOfSpech = fields[ cursor++ ];
			retVal.SenseCount = Convert.ToInt32( fields[ cursor++ ] );
			retVal.PointersUsedCount = Convert.ToInt32( fields[ cursor++ ] );

			// Pointer symbols are translated to their index in the pointer-type table.
			int pointersEnd = cursor + retVal.PointersUsedCount;
			for( int p = cursor; p < pointersEnd; p++ )
			{
				retVal.PointersUsed.Add( GetPointerTypeIndex( fields[ p ] ) );
			}
			cursor = pointersEnd;

			retVal.OffsetCount = Convert.ToInt32( fields[ cursor++ ] );
			retVal.TaggedSensesCount = Convert.ToInt32( fields[ cursor++ ] );

			int offsetsEnd = cursor + retVal.OffsetCount;
			for( int s = cursor; s < offsetsEnd; s++ )
			{
				retVal.SynSetsOffsets.Add( Convert.ToInt64( fields[ s ] ) );
			}

			return retVal;
		}
		#endregion ParseIndex

		#region ParseDefinition
		/// <summary>
		/// Reads the data-file line located at <paramref name="offset"/> and parses it into a
		/// <see cref="Definition"/>.
		/// </summary>
		/// <param name="offset">Byte offset of the line; must equal the offset stored in the line's first field.</param>
		/// <param name="dbFileName">Data file to read. Its extension decides whether adjective or verb specific fields are parsed.</param>
		/// <param name="word">Word whose position inside the synset should be recorded in <c>WhichWord</c>; may be null.</param>
		/// <returns>The parsed definition, or a freshly constructed one when no line could be read.</returns>
		/// <exception cref="ArithmeticException">The offset stored in the line differs from <paramref name="offset"/>.</exception>
		/// <remarks>
		/// Per the WordNet data file format, the word count, the source/target halves of every pointer
		/// and the word number of every verb frame are hexadecimal fields; all other counts are decimal.
		/// </remarks>
		internal static Definition ParseDefinition( long offset, string dbFileName, string word )
		{
			Definition retVal = new Definition();
			string data = ReadIndex( offset, dbFileName );
			if( string.IsNullOrEmpty( data ) )
				return retVal;

			int i = 0;
			bool foundPert = false;
			string[] tokens = data.Split( Constants.Tokenizer, StringSplitOptions.RemoveEmptyEntries );

			retVal.Position = Convert.ToInt64( tokens[ i ] );
			i++;

			// The line announces its own offset; a mismatch means we did not land on a line start.
			if( retVal.Position != offset )
				throw new ArithmeticException( "The stream position is not aligned with the specified offset!" );

			retVal.FileNumber = Convert.ToInt32( tokens[ i ] );
			i++;

			retVal.PartOfSpeech = tokens[ i ];
			i++;

			if( GetSynSetTypeCode( retVal.PartOfSpeech ) == DbPartOfSpechType.Satellite )
				retVal.DefinitionType = Constants.INDIRECT_ANT;

			// Word count is a two-digit HEXADECIMAL field ("0a" == 10 words).
			retVal.WordCount = Convert.ToInt32( tokens[ i ], 16 );
			i++;

			for( int j = 0; j < retVal.WordCount * 2; j += 2 ) //Step by two for lexid
			{
				string tempWord = tokens[ i + j ];
				if( !string.IsNullOrEmpty( tempWord ) )
					retVal.Words.Add( DecodeWord( tempWord ) );

				// Null-safe, culture-independent comparison; a null word simply never matches.
				// NOTE(review): this stores the token index (i + j), not a word number - confirm
				// against the consumers of WhichWord before changing it.
				if( string.Equals( tempWord, word, StringComparison.OrdinalIgnoreCase ) )
					retVal.WhichWord = ( i + j );
			}
			i = ( i + ( retVal.WordCount * 2 ) );

			retVal.PtrCount = Convert.ToInt32( tokens[ i ] );
			i++;

			// The database type depends only on the file name, so evaluate it once.
			bool isAdjectiveDb = AssertDatabaseType( dbFileName, DbPartOfSpechType.Adj );

			// Each pointer occupies four tokens: symbol, target offset, target part of speech, source/target.
			int pointersEnd = i + ( retVal.PtrCount * 4 );
			for( int j = i; j < pointersEnd; j += 4 )
			{
				int pointerIndex = GetPointerTypeIndex( tokens[ j ] );
				long pointerOffset = Convert.ToInt64( tokens[ j + 1 ] );
				int pointerPartOfSpeech = GetPartOfSpeech( Convert.ToChar( tokens[ j + 2 ] ) );

				// Four hex digits: the first two are the source word number, the last two the target.
				string lexToFrom = tokens[ j + 3 ];
				int lexFrom = Convert.ToInt32( lexToFrom.Substring( 0, 2 ), 16 );
				int lexTo = Convert.ToInt32( lexToFrom.Substring( 2, 2 ), 16 );

				retVal.PtrTypes.Add( pointerIndex );
				retVal.PtrOffsets.Add( pointerOffset );
				retVal.PtrPartOfSpeech.Add( pointerPartOfSpeech );
				retVal.PtrFromFields.Add( lexFrom );
				retVal.PtrToFields.Add( lexTo );

				if( isAdjectiveDb && retVal.DefinitionType == Constants.DONT_KNOW )
				{
					if( pointerIndex == Constants.PointerTypeContants.ANTPTR )
					{
						retVal.DefinitionType = Constants.DIRECT_ANT;
					}
					else if( pointerIndex == Constants.PointerTypeContants.PERTPTR )
					{
						foundPert = true;
					}
				}
			}
			i = pointersEnd;

			// A pertainym pointer only classifies the synset when no direct antonym was seen.
			if( isAdjectiveDb && retVal.DefinitionType == Constants.DONT_KNOW && foundPert )
			{
				retVal.DefinitionType = Constants.PERTAINY;
			}

			if( AssertDatabaseType( dbFileName, DbPartOfSpechType.Verb ) )
			{
				retVal.VerbFrameCount = Convert.ToInt32( tokens[ i ] );
				i++;

				// Each frame occupies three tokens: a marker, the decimal frame number, the hex word number.
				int framesEnd = i + ( retVal.VerbFrameCount * 3 );
				for( int j = i; j < framesEnd; j += 3 )
				{
					int frameId = Convert.ToInt32( tokens[ j + 1 ] );
					int frameTo = Convert.ToInt32( tokens[ j + 2 ], 16 );

					retVal.FrameIds.Add( frameId );
					retVal.FrameToFields.Add( frameTo );
				}
				i = framesEnd;
			}
			i++; // Skip the token that separates the structured fields from the definition text.

			string definition = string.Join( " ", tokens, i, tokens.Length - i );
			retVal.DefinitionText = definition;

			retVal.SenseNumbers = new List<int>( new int[ retVal.WordCount ] );
			for( int j = 0; j < retVal.WordCount; j++ )
			{
				retVal.SenseNumbers[ j ] = GetSearchSense( retVal, j );
			}

			return retVal;
		}
		#endregion ParseDefinition

		#region GetSearchSense
		/// <summary>
		/// Determines the 1-based sense number that the synset <paramref name="def"/> has for one of its words.
		/// </summary>
		/// <param name="def">Parsed synset; its <c>PartOfSpeech</c>, <c>Words</c> and <c>Position</c> are read.</param>
		/// <param name="whichWord">Zero-based index into <c>def.Words</c>.</param>
		/// <returns>
		/// The 1-based position of <c>def.Position</c> among the word's synset offsets in the index file,
		/// or 0 when the word is not in the index or does not list this synset.
		/// </returns>
		private static int GetSearchSense( Definition def, int whichWord )
		{
			DbPartOfSpechType indexType = GetSynSetTypeCode( def.PartOfSpeech );
			string dbFileName = DbFileHelper.GetIndexForType( indexType )[ 0 ];

			// Index files key on the lowercase, underscore-joined lemma without any syntactic marker,
			// whereas def.Words holds the decoded (space separated, case preserved) form.
			string lemma = EncodeWord( def.Words[ whichWord ] );
			int markerStart = lemma.IndexOf( '(' );
			if( markerStart > 0 )
				lemma = lemma.Substring( 0, markerStart );
			lemma = lemma.ToLowerInvariant();

			// FastSearch reports "not found" as 0; parsing offset 0 would misread the file's first line.
			long offset = FastSearch( lemma, dbFileName );
			if( offset == 0L )
				return 0;

			Index idx = ParseIndex( offset, dbFileName, string.Empty );

			// Never trust the declared count beyond what was actually parsed.
			int available = Math.Min( idx.OffsetCount, idx.SynSetsOffsets.Count );
			for( int i = 0; i < available; i++ )
			{
				if( idx.SynSetsOffsets[ i ] == def.Position )
					return i + 1;
			}

			return 0;
		}
		#endregion GetSearchSense

		#region ReadIndex
		/// <summary>
		/// Returns the text of the line that starts at a given position of a file.
		/// </summary>
		/// <param name="offset">Position, counted from the beginning of the file, at which reading starts.</param>
		/// <param name="dbFileName">Path of the file to read.</param>
		/// <returns>
		/// The characters from <paramref name="offset"/> up to the next line break,
		/// or null when <paramref name="offset"/> is at or beyond the end of the file.
		/// </returns>
		internal static string ReadIndex( long offset, string dbFileName )
		{
			using( StreamReader lineReader = new StreamReader( dbFileName, true ) )
			{
				// Nothing has been buffered yet, so moving the underlying stream is safe here.
				lineReader.BaseStream.Seek( offset, SeekOrigin.Begin );
				return lineReader.ReadLine();
			}
		}
		#endregion ReadIndex

		#region FastSearch
		/// <summary>
		/// Binary-searches a sorted, line-oriented database file for the line whose first token
		/// equals <paramref name="keyword"/>.
		/// </summary>
		/// <param name="keyword">Key to look for, in exactly the form stored in the file.</param>
		/// <param name="dbFileName">Path of the file to search; its lines must be sorted in ordinal (byte) order.</param>
		/// <returns>
		/// The byte offset of the start of the matching line, or 0 when no line matches.
		/// A match on the very first line is therefore indistinguishable from "not found".
		/// </returns>
		internal static long FastSearch( string keyword, string dbFileName )
		{
			Encoding enc;
			using( StreamReader reader = new StreamReader( dbFileName, true ) )
			{
				// Encoding detection only happens on the first read, so trigger one before asking.
				reader.Peek();
				enc = reader.CurrentEncoding;
			}

			using( FileStream fs = File.OpenRead( dbFileName ) )
			{
				long top = 0;
				long bottom = fs.Length;
				if( bottom == 0 )
					return 0L; // Nothing to search, and seeking to -1 below would throw.

				long mid = ( bottom - top ) / 2;
				long diff;
				byte[] btData = new byte[ Constants.KEY_LEN ];

				do
				{
					fs.Seek( Math.Max( mid - 1, 0L ), SeekOrigin.Begin );
					if( mid != 1 )
					{
						// Advance to just past the next line break so that we sit on a line start.
						int current;
						do
						{
							current = fs.ReadByte();
						}
						while( current != '\n' && current != -1 );
					}

					// Capture the line start on every probe; it is the value reported on a match.
					long lineStart = fs.Position;

					// Decode only the bytes actually read (the read can be short near the end of the file).
					int count = fs.Read( btData, 0, btData.Length );
					string key = enc.GetString( btData, 0, count ).Split( Constants.Tokenizer )[ 0 ];

					// The file is sorted by byte value, so the comparison must be ordinal as well.
					int order = string.CompareOrdinal( key, keyword );
					if( order == 0 )
						return lineStart;

					if( order < 0 )
						top = mid;
					else
						bottom = mid;

					diff = ( bottom - top ) / 2;
					mid = top + diff;
				}
				while( diff != 0 );
			}

			return 0L;
		}
		#endregion FastSearch

		#region AssertDatabaseType
		/// <summary>
		/// Tells whether a database file belongs to a given part of speech, judged by its file extension.
		/// </summary>
		/// <param name="dbFileName">Path or name of the database file.</param>
		/// <param name="type">Part of speech whose name is compared with the extension.</param>
		/// <returns>
		/// True when the extension (without the leading dot) equals the name of <paramref name="type"/>,
		/// ignoring case; false otherwise, including when the file name has no extension.
		/// </returns>
		private static bool AssertDatabaseType( string dbFileName, DbPartOfSpechType type )
		{
			string extension = Path.GetExtension( dbFileName );

			// No extension means no type information; report a mismatch instead of failing in Substring.
			if( string.IsNullOrEmpty( extension ) )
				return false;

			// Skip the leading dot and compare without depending on the current culture.
			return string.Equals( extension.Substring( 1 ), type.ToString(), StringComparison.OrdinalIgnoreCase );
		}
		#endregion AssertDatabaseType

		#region GetPointerTypeIndex
		/// <summary>
		/// Looks up a pointer symbol in <c>Constants.PointerTypes</c>.
		/// </summary>
		/// <param name="pointerMark">Pointer symbol to look up; compared after lower-casing and trimming.</param>
		/// <returns>The index of the first matching entry, or -1 when there is none.</returns>
		internal static int GetPointerTypeIndex( string pointerMark )
		{
			int position = 0;
			while( position < Constants.PointerTypes.Length )
			{
				string candidate = Constants.PointerTypes[ position ].ToLower().Trim();
				if( candidate == pointerMark.ToLower().Trim() )
					return position;

				position++;
			}

			return -1;
		}
		#endregion GetPointerTypeIndex

		#region GetSynSetTypeCode
		/// <summary>
		/// Maps the first character of a synset type string to its <see cref="DbPartOfSpechType"/>.
		/// </summary>
		/// <param name="data">Synset type string; only its first character is inspected, so it must not be empty.</param>
		/// <returns>
		/// Noun for 'n', Adj for 'a', Verb for 'v', Satellite for 's', Adv for 'r',
		/// and All for any other character.
		/// </returns>
		internal static DbPartOfSpechType GetSynSetTypeCode( string data )
		{
			char marker = data[ 0 ];

			if( marker == 'n' )
				return DbPartOfSpechType.Noun;
			if( marker == 'a' )
				return DbPartOfSpechType.Adj;
			if( marker == 'v' )
				return DbPartOfSpechType.Verb;
			if( marker == 's' )
				return DbPartOfSpechType.Satellite;
			if( marker == 'r' )
				return DbPartOfSpechType.Adv;

			return DbPartOfSpechType.All;
		}
		#endregion GetSynSetTypeCode

		#region GetPartOfSpeech
		/// <summary>
		/// Maps a part-of-speech character to the matching <c>Constants.POS_*</c> value.
		/// </summary>
		/// <param name="data">Part-of-speech character.</param>
		/// <returns>
		/// POS_NOUN for 'n', POS_ADJ for 'a' or 's', POS_VERB for 'v', POS_ADV for 'r',
		/// and -1 for any other character.
		/// </returns>
		internal static int GetPartOfSpeech( char data )
		{
			if( data == 'n' )
				return Constants.POS_NOUN;

			// 's' is reported with the same value as 'a'.
			if( data == 'a' || data == 's' )
				return Constants.POS_ADJ;

			if( data == 'v' )
				return Constants.POS_VERB;

			if( data == 'r' )
				return Constants.POS_ADV;

			return -1;
		}
		#endregion GetPartOfSpeech

		#region EncodeWord
		/// <summary>
		/// Converts a word to its stored form by replacing every space with an underscore.
		/// </summary>
		/// <param name="data">Word to convert; must not be null.</param>
		/// <returns>A copy of <paramref name="data"/> with ' ' replaced by '_'.</returns>
		private static string EncodeWord( string data )
		{
			return data.Replace( ' ', '_' );
		}
		#endregion EncodeWord

		#region DecodeWord
		/// <summary>
		/// Converts a stored word to its readable form by replacing every underscore with a space.
		/// </summary>
		/// <param name="data">Word to convert; must not be null.</param>
		/// <returns>A copy of <paramref name="data"/> with '_' replaced by ' '.</returns>
		private static string DecodeWord( string data )
		{
			return data.Replace( '_', ' ' );
		}
		#endregion DecodeWord
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -