⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 txtconv.cpp

📁 Emdros is a text database middleware-layer aimed at storage and retrieval of "text plus information about that text."
💻 CPP
字号:
/*
	TxtConv.c
	Copyright (c) 2002-2004 SIL International.

	Command-line driver: reads an input file, runs it through a TECkit
	converter (optionally built from a compiled mapping table) and writes
	the result to an output file.

	 5-May-2005		jk	added include <stdlib.h> and <string.h> to keep gcc happy
	10-Mar-2004		jk	added -u option to control handling of unmappable input
*/

#include "TECkit_Engine.h"

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#if __MWERKS__
#if __dest_os == __mac_os
#include "SIOUX.h"
#include "console.h"
#endif
#endif

/*
	Work out the byte order of the target so that the plain "utf16" and
	"utf32" form names can be mapped to the native-endian variant.
*/
//#ifdef HAVE_CONFIG_H
//#	include "config.h"	/* a Unix-ish setup where we have config.h available */
//#else
#	if	(defined __dest_os && (__dest_os == __win32_os)) || defined WIN32 || defined _WIN32	/* Windows target: little-endian */
#		undef WORDS_BIGENDIAN
#	else
#		if (defined TARGET_RT_BIG_ENDIAN)	/* the CodeWarrior prefix files set this */
#			if TARGET_RT_BIG_ENDIAN
#				define WORDS_BIGENDIAN 1
#			else
#				undef WORDS_BIGENDIAN
#			endif
#		else
#			if defined(__APPLE__)
#				if defined(__BIG_ENDIAN__)
#					define WORDS_BIGENDIAN 1
#				else
#					undef WORDS_BIGENDIAN
#				endif
#			elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)	/* predefined by GCC and Clang */
#				if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#					define WORDS_BIGENDIAN 1
#				else
#					undef WORDS_BIGENDIAN
#				endif
#			else
#				error Unsure about endianness!
#			endif
#		endif
#	endif
//#endif

/* encoding forms selected by the endian-neutral names "utf16" and "utf32" */
#ifdef WORDS_BIGENDIAN
#	define kNativeUTF16	kForm_UTF16BE
#	define kNativeUTF32	kForm_UTF32BE
#else
#	define kNativeUTF16	kForm_UTF16LE
#	define kNativeUTF32	kForm_UTF32LE
#endif

#define	kInBufLen	4096
#define kOutBufLen	(kInBufLen * 4)

/*
	Write length bytes from data to outFile.
	Returns 0 on success; prints a message and returns 1 if the write fails.
*/
static int
writeOutput(FILE* outFile, const char* data, UInt32 length)
{
	if (length > 0 && fwrite(data, 1, length, outFile) != length) {
		fprintf(stderr, "error writing output file\n");
		return 1;
	}
	return 0;
}

/*
	Convert everything from the current position of inFile to its end,
	writing the converted data to outFile.

	cnv		converter to run the data through
	inFile	input stream; must be seekable, as its length is measured with fseek/ftell
	outFile	output stream
	opts	option flags passed unchanged to TECkit_ConvertBufferOpt and TECkit_FlushOpt

	Returns 2 if processing failed (I/O error, or the final status has a
	non-zero basic status code), 1 if the final status only carries a
	warning, and 0 otherwise.
*/
static int
doConversion(TECkit_Converter cnv, FILE* inFile, FILE* outFile, UInt32 opts)
{
	char			inBuffer[kInBufLen];
	char			outBuffer[kOutBufLen];
	UInt32			inLength;
	UInt32			savedLen = 0;	/* unconsumed bytes carried over at the start of inBuffer */
	UInt32			offset = 0;		/* bytes read from inFile so far */
	UInt32			outUsed = 0;
	UInt32			lookahead = 0;
	TECkit_Status	status = kStatus_NoError;	/* stays NoError if the input is empty */
	int				failed = 0;
	int				needFlush = 1;
	long			startPos;
	long			endPos;

	/* measure how much input lies between the current position and end of file */
	startPos = ftell(inFile);
	if (startPos < 0 || fseek(inFile, 0, SEEK_END) != 0) {
		fprintf(stderr, "unable to determine length of input file\n");
		return 2;
	}
	endPos = ftell(inFile);
	if (endPos < startPos || fseek(inFile, startPos, SEEK_SET) != 0) {
		fprintf(stderr, "unable to determine length of input file\n");
		return 2;
	}
	inLength = (UInt32)(endPos - startPos);

	for (;;) {
		UInt32			inUsed = 0;
		UInt32			inAvail;
		UInt32			amountToRead = kInBufLen - savedLen;
		TECkit_Status	basic;

		if (amountToRead > inLength - offset)
			amountToRead = inLength - offset;

		if (amountToRead > 0) {
			size_t	got = fread(inBuffer + savedLen, 1, amountToRead, inFile);
			if (got < amountToRead) {
				/* short read: treat whatever arrived as the end of the input */
				if (ferror(inFile)) {
					fprintf(stderr, "error reading input file\n");
					failed = 1;
				}
				amountToRead = (UInt32)got;
				inLength = offset + amountToRead;
			}
			offset += amountToRead;
		}

		inAvail = savedLen + amountToRead;
		if (inAvail == 0)
			break;	/* nothing left to feed the converter: go and flush it */

		outUsed = 0;
		status = TECkit_ConvertBufferOpt(cnv, (Byte*)inBuffer, inAvail, &inUsed,
											(Byte*)outBuffer, kOutBufLen, &outUsed, opts, &lookahead);
		if (writeOutput(outFile, outBuffer, outUsed) != 0)
			return 2;

		basic = status & kStatusMask_Basic;
		if (basic == kStatus_OutputBufferFull || basic == kStatus_NeedMoreInput) {
			/* keep any unconsumed tail at the front of the buffer for the next round;
			   source and destination may overlap, hence memmove */
			savedLen = (inUsed < inAvail) ? inAvail - inUsed : 0;
			if (savedLen > 0)
				memmove(inBuffer, inBuffer + inUsed, savedLen);
			continue;
		}

		if (basic == kStatus_UnmappedChar) {
			/* inBuffer[0] corresponds to file offset (offset - inAvail) */
			fprintf(stderr, "processing aborted at unmappable character, within %lu characters before file offset %lu\n",
						(unsigned long)lookahead, (unsigned long)(offset - inAvail + inUsed));
			needFlush = 0;
		}
		else {
			fprintf(stderr, "bad returned status from TECkit_ConvertBuffer: %ld\n", (long)status);
			/* the flush below overwrites status, so remember the failure here */
			if (basic != kStatus_NoError)
				failed = 1;
		}
		break;
	}

	if (needFlush) {
		// flush the converter
		do {
			outUsed = 0;
			status = TECkit_FlushOpt(cnv, (Byte*)outBuffer, kOutBufLen, &outUsed, opts, &lookahead);
			if (writeOutput(outFile, outBuffer, outUsed) != 0)
				return 2;
		} while ((status & kStatusMask_Basic) == kStatus_OutputBufferFull);

		if ((status & kStatusMask_Basic) == kStatus_UnmappedChar)
			fprintf(stderr, "processing aborted at unmappable character, within %lu characters before end of input\n",
						(unsigned long)lookahead);
		else if ((status & kStatusMask_Basic) != kStatus_NoError)
			fprintf(stderr, "bad returned status from TECkit_Flush: %ld\n", (long)status);
	}

	if ((status & kStatusMask_Warning) == kStatus_UsedReplacement)
		fprintf(stderr, "warning: unmapped input replaced by default replacement character\n");

	if (failed || (status & kStatusMask_Basic) != 0)
		return 2;
	return ((status & kStatusMask_Warning) != 0) ? 1 : 0;
}

/*
	Fetch the string value that follows the current command-line option.

	pargc	number of arguments left, counting the current option
	pargv	points at the current option; advanced to the value on success
	pstr	receives the value; must be null on entry unless the option is repeated

	Returns 0 on success, 1 after reporting a missing value or a repeated
	option.  When the value is missing nothing is consumed, so the caller's
	argument loop ends normally instead of running past the end of argv.
*/
static int
stringArg(int* pargc, char*** pargv, char** pstr)
{
	char*	optName = **pargv;
	int		err = 0;

	if (*pargc == 1) {
		fprintf(stderr, "missing value after %s\n", optName);
		return 1;
	}
	if (*pstr != 0) {
		fprintf(stderr, "repeated argument %s\n", optName);
		err = 1;
	}
	*pstr = *++*pargv;
	--*pargc;
	return err;
}

/*
	Fetch the encoding-form name that follows -if or -of and translate it.

	pargc, pargv	as for stringArg
	pform			receives the kForm_* value on success; untouched otherwise

	Returns 0 on success, 1 after reporting a missing or unrecognised name.
*/
static int
formArg(int* pargc, char*** pargv, UInt16* pform)
{
	static const struct {
		const char*	name;
		UInt16		form;
	} forms[] = {
		{ "bytes",		kForm_Bytes },
		{ "utf8",		kForm_UTF8 },
		{ "utf16",		kNativeUTF16 },
		{ "utf16be",	kForm_UTF16BE },
		{ "utf16le",	kForm_UTF16LE },
		{ "utf32",		kNativeUTF32 },
		{ "utf32be",	kForm_UTF32BE },
		{ "utf32le",	kForm_UTF32LE }
	};
	char*	optName = **pargv;
	char*	value;
	size_t	i;

	if (*pargc == 1) {
		fprintf(stderr, "missing value after %s\n", optName);
		return 1;
	}
	value = *++*pargv;
	--*pargc;

	for (i = 0; i < sizeof(forms) / sizeof(forms[0]); ++i) {
		if (strcmp(value, forms[i].name) == 0) {
			*pform = forms[i].form;
			return 0;
		}
	}
	fprintf(stderr, "unknown encoding form %s after %s\n", value, optName);
	return 1;
}

/*
	Return non-zero if the bufLen bytes at buf begin with the sigLen-byte
	signature sig.  Bytes beyond bufLen are never examined.
*/
static int
startsWith(const unsigned char* buf, size_t bufLen, const char* sig, size_t sigLen)
{
	return bufLen >= sigLen && memcmp(buf, sig, sigLen) == 0;
}

/*
	Read bomLen bytes from the start of inFile; if they are not exactly the
	given byte-order mark, rewind so that they are treated as data.
*/
static void
skipBOM(FILE* inFile, const char* bom, size_t bomLen)
{
	unsigned char	sig[4];
	size_t			got = fread(sig, 1, bomLen, inFile);

	if (!startsWith(sig, got, bom, bomLen))
		fseek(inFile, 0, SEEK_SET);
}

/*
	Parse the command line, load the optional mapping table, determine the
	input and output encoding forms (sniffing or skipping a BOM on Unicode
	input, writing one on Unicode output unless -nobom is given), then run
	the conversion.

	Returns the result of doConversion, or a non-zero value if set-up fails.
*/
int
main(int argc, char** argv)
{
#if __MWERKS__ && __dest_os == __mac_os
	SIOUXSettings.asktosaveonclose = 0;
	argc = ccommand(&argv);
#endif

	char*	progName = argv[0];
	char*	tecFileName = 0;
	char*	inFileName = 0;
	char*	outFileName = 0;
	char*	nameID = 0;		/* accepted via -n but not used further in this file */
	char	forward = 1;
	char*	unmappedOption = 0;
	UInt16	inForm = kForm_Unspecified;
	UInt16	outForm = kForm_Unspecified;
	int		cmdLineErr = 0;
	UInt16	normForm = 0;
	char	noBOM = 0;
	UInt32	opts = kOptionsUnmapped_UseReplacementCharSilently;
	int		rval;

	UInt32	len = 0;
	char*	table = 0;
	FILE*	tecFile;
	FILE*	inFile;
	FILE*	outFile;
	UInt32	sourceFlags, targetFlags;
	TECkit_Status	status;
	TECkit_Converter	cnv;

	if (TECkit_GetVersion() != kCurrentTECkitVersion) {
		fprintf(stderr, "incorrect TECkit library version\n");
		return 1;
	}

	/* inside the loop, argc counts the arguments left including the current one */
	while (--argc) {
		char*	arg = *++argv;

		if (arg[0] != '-') {
			fprintf(stderr, "missing option flag at %s\n", arg);
			++cmdLineErr;
			continue;
		}

		if (strlen(arg + 1) == 1) {
			switch (arg[1]) {
				case 'n':
					cmdLineErr += stringArg(&argc, &argv, &nameID);
					break;
				case 't':
					cmdLineErr += stringArg(&argc, &argv, &tecFileName);
					break;
				case 'i':
					cmdLineErr += stringArg(&argc, &argv, &inFileName);
					break;
				case 'o':
					cmdLineErr += stringArg(&argc, &argv, &outFileName);
					break;
				case 'u':
					cmdLineErr += stringArg(&argc, &argv, &unmappedOption);
					break;
				case 'r':
					forward = 0;
					break;
				default:
					fprintf(stderr, "unknown option %s\n", arg);
					++cmdLineErr;
					break;
			}
		}
		else if (strcmp(arg + 1, "if") == 0)
			cmdLineErr += formArg(&argc, &argv, &inForm);
		else if (strcmp(arg + 1, "of") == 0)
			cmdLineErr += formArg(&argc, &argv, &outForm);
		else if (strcmp(arg + 1, "nfc") == 0)
			normForm = kForm_NFC;
		else if (strcmp(arg + 1, "nfd") == 0)
			normForm = kForm_NFD;
		else if (strcmp(arg + 1, "nobom") == 0)
			noBOM = 1;
		else {
			fprintf(stderr, "unknown option %s\n", arg);
			++cmdLineErr;
		}
	}

	if (unmappedOption != 0) {
		if (strlen(unmappedOption) != 1 || *unmappedOption < '0' || *unmappedOption > '2') {
			fprintf(stderr, "unknown value '%s' for -u option (expected 0-2)\n", unmappedOption);
			++cmdLineErr;
		}
		else {
			switch (*unmappedOption) {
				case '0':
					opts = kOptionsUnmapped_UseReplacementCharSilently;
					break;
				case '1':
					opts = kOptionsUnmapped_UseReplacementCharWithWarning;
					break;
				case '2':
					opts = kOptionsUnmapped_DontUseReplacementChar;
					break;
			}
		}
	}

	if (cmdLineErr != 0 || inFileName == 0 || outFileName == 0) {
		fprintf(stderr,
			"Usage: %s -i inFile -o outFile [-t tecFile] [-r] [-if inForm] [-of outForm] [-nobom] [-nf[cd]] [-u n]\n"
			"    Required arguments:\n"
			"        -i <file>   input file\n"
			"        -o <file>   output file\n"
			"    Optional arguments:\n"
			"        -t <file>   compiled TECkit mapping (.tec) file\n"
			"        -r          reverse (RHS->LHS, or Unicode->Byte) mapping\n"
			"        -if <form>  input encoding form\n"
			"        -of <form>  output encoding form\n"
			"        -nobom      don't write a BOM to Unicode output\n"
			"        -nf[cd]     apply NFC or NFD normalization to Unicode output\n"
			"        -u <n>      handling of unmappable input:\n"
			"                        0 = use replacement character\n"
			"                        1 = use replacement but generate warning\n"
			"                        2 = stop conversion\n"
			"    Encoding forms:\n"
			"        bytes utf8 utf16be utf16le utf16 utf32be utf32le utf32\n",
			progName);
		exit(cmdLineErr != 0);
	}

	// examine the mapping (if any) to determine the input and output code spaces
	if (tecFileName != 0) {
		long	tableSize;

		tecFile = fopen(tecFileName, "rb");
		if (tecFile == 0) {
			fprintf(stderr, "unable to load mapping table file %s\n", tecFileName);
			return 1;
		}
		fseek(tecFile, 0, SEEK_END);
		tableSize = ftell(tecFile);
		fseek(tecFile, 0, SEEK_SET);
		if (tableSize <= 0) {
			/* ftell failed or the file is empty: there is no table to load */
			fprintf(stderr, "unable to load mapping table file %s\n", tecFileName);
			fclose(tecFile);
			return 1;
		}
		len = (UInt32)tableSize;
		table = (char*)malloc(len);
		if (table == 0) {
			fprintf(stderr, "out of memory!\n");
			fclose(tecFile);
			return 1;
		}
		if (fread(table, 1, len, tecFile) != len) {
			fprintf(stderr, "unable to load mapping table file %s\n", tecFileName);
			free(table);
			fclose(tecFile);
			return 1;
		}
		fclose(tecFile);

		/* for a reverse mapping the table's two sides swap roles */
		status = forward
			? TECkit_GetMappingFlags((Byte*)table, len, &sourceFlags, &targetFlags)
			: TECkit_GetMappingFlags((Byte*)table, len, &targetFlags, &sourceFlags);
		if (status != kStatus_NoError) {
			fprintf(stderr, "couldn't get encoding flags from mapping\n");
			free(table);
			return 1;
		}
	}
	else {
		sourceFlags = kFlags_Unicode;
		targetFlags = kFlags_Unicode;
	}

	inFile = fopen(inFileName, "rb");
	if (inFile == 0) {
		fprintf(stderr, "unable to open input file \"%s\"\n", inFileName);
		return 1;
	}

	if (sourceFlags & kFlags_Unicode) {
		// if the input is supposed to be Unicode, and the inForm is unspecified, try to guess it
		// or skip over the BOM if one is found that matches the specified inForm
		unsigned char	sig[4];
		size_t			sigLen;

		if (inForm == kForm_Bytes) {
			fprintf(stderr, "improper input encoding form for this mapping");
			return 1;
		}

		switch (inForm) {
			case kForm_Unspecified:
				/* sigLen guards against comparing bytes a short file never supplied */
				sigLen = fread(sig, 1, 4, inFile);
				if (startsWith(sig, sigLen, "\xef\xbb\xbf", 3)) {
					inForm = kForm_UTF8;
					fseek(inFile, 3, SEEK_SET);
				}
				else if (startsWith(sig, sigLen, "\xfe\xff", 2)) {
					inForm = kForm_UTF16BE;
					fseek(inFile, 2, SEEK_SET);
				}
				else if (startsWith(sig, sigLen, "\xff\xfe\x00\x00", 4)) {
					/* must be tested before UTF-16LE, whose BOM is a prefix of this one;
					   the four bytes just read are the BOM, so no seek is needed */
					inForm = kForm_UTF32LE;
				}
				else if (startsWith(sig, sigLen, "\xff\xfe", 2)) {
					inForm = kForm_UTF16LE;
					fseek(inFile, 2, SEEK_SET);
				}
				else if (startsWith(sig, sigLen, "\x00\x00\xfe\xff", 4)) {
					/* the four bytes just read are the BOM, so no seek is needed */
					inForm = kForm_UTF32BE;
				}
				else {
					/* no BOM: fall back to UTF-8 and reread from the start */
					inForm = kForm_UTF8;
					fseek(inFile, 0, SEEK_SET);
				}
				break;

			case kForm_UTF8:
				skipBOM(inFile, "\xef\xbb\xbf", 3);
				break;

			case kForm_UTF16BE:
				skipBOM(inFile, "\xfe\xff", 2);
				break;

			case kForm_UTF16LE:
				skipBOM(inFile, "\xff\xfe", 2);
				break;

			case kForm_UTF32BE:
				skipBOM(inFile, "\x00\x00\xfe\xff", 4);
				break;

			case kForm_UTF32LE:
				skipBOM(inFile, "\xff\xfe\x00\x00", 4);
				break;
		}
	}
	else {
		// if the input is not Unicode, the inputForm had better be Bytes
		switch (inForm) {
			case kForm_Unspecified:
				inForm = kForm_Bytes;
				break;
			case kForm_Bytes:
				break;
			default:
				fprintf(stderr, "improper input encoding form for this mapping");
				return 1;
		}
	}

	// choose an output encoding form (if not specified) and write BOM to Unicode output file
	outFile = fopen(outFileName, "wb");
	if (outFile == 0) {
		fprintf(stderr, "unable to open output file \"%s\"\n", outFileName);
		return 1;
	}
	if (targetFlags & kFlags_Unicode) {
		if (outForm == kForm_Bytes) {
			fprintf(stderr, "improper output encoding form for this mapping");
			return 1;
		}
		if (outForm == kForm_Unspecified) {
			if (inForm == kForm_Bytes)
				outForm = kForm_UTF8;
			else
				outForm = inForm;
		}
		if (!noBOM) {
			switch (outForm) {
				case kForm_UTF8:
					fwrite("\xef\xbb\xbf", 1, 3, outFile);
					break;
				case kForm_UTF16BE:
					fwrite("\xfe\xff", 1, 2, outFile);
					break;
				case kForm_UTF16LE:
					fwrite("\xff\xfe", 1, 2, outFile);
					break;
				case kForm_UTF32BE:
					fwrite("\x00\x00\xfe\xff", 1, 4, outFile);
					break;
				case kForm_UTF32LE:
					fwrite("\xff\xfe\x00\x00", 1, 4, outFile);
					break;
			}
		}
	}
	else {
		switch (outForm) {
			case kForm_Unspecified:
				outForm = kForm_Bytes;
				break;
			case kForm_Bytes:
				break;
			default:
				fprintf(stderr, "improper output encoding form for this mapping");
				return 1;
		}
	}

	// OK, we have figured out the input and output encoding forms we want to use;
	// now at last we can instantiate a converter
	status = TECkit_CreateConverter((Byte*)table, len, forward, inForm, outForm | normForm, &cnv);
	free(table);
	if (status != kStatus_NoError) {
		fprintf(stderr, "bad returned status from TECkit_CreateConverter: %ld\n", (long)status);
		return (int)status;
	}

	// do the actual conversion
	rval = doConversion(cnv, inFile, outFile, opts);

	fclose(inFile);
	if (fclose(outFile) != 0) {
		/* buffered output may only fail when the stream is closed */
		fprintf(stderr, "error writing output file\n");
		rval = 2;
	}

	status = TECkit_DisposeConverter(cnv);
	if (status != kStatus_NoError) {
		fprintf(stderr, "bad returned status from TECkit_DisposeConverter: %ld\n", (long)status);
		return (int)status;
	}

	return rval;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -