📄 textdocument.cpp
字号:
//
// MODULE: TextDocument.cpp
//
// PURPOSE: Basic implementation of a text data-sequence class
//
// NOTES: www.catch22.net
//
#define STRICT
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include "TextDocument.h"
#include "TextView.h"
#include "Unicode.h"
struct _BOM_LOOKUP BOMLOOK[] =
{
// define longest headers first
{ 0x0000FEFF, 4, NCP_UTF32 },
{ 0xFFFE0000, 4, NCP_UTF32BE },
{ 0xBFBBEF, 3, NCP_UTF8 },
{ 0xFFFE, 2, NCP_UTF16BE },
{ 0xFEFF, 2, NCP_UTF16 },
{ 0, 0, NCP_ASCII },
};
//
// TextDocument constructor
//
TextDocument::TextDocument()
{
// buffer = 0;
m_nDocLength_bytes = 0;
m_nDocLength_chars = 0;
m_pLineBuf_byte = 0;
m_pLineBuf_char = 0;
m_nNumLines = 0;
m_nFileFormat = NCP_ASCII;
m_nHeaderSize = 0;
}
//
// TextDocument destructor
//
TextDocument::~TextDocument()
{
clear();
}
//
// Initialize the TextDocument with the specified file
//
bool TextDocument::init(TCHAR *filename)
{
HANDLE hFile;
hFile = CreateFile(filename, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
if(hFile == INVALID_HANDLE_VALUE)
return false;
return init(hFile);
}
//
// Initialize using a file-handle
//
bool TextDocument::init(HANDLE hFile)
{
ULONG numread;
char *buffer;
if((m_nDocLength_bytes = GetFileSize(hFile, 0)) == 0)
return false;
// allocate new file-buffer
if((buffer = new char[m_nDocLength_bytes]) == 0)
return false;
// read entire file into memory
ReadFile(hFile, buffer, m_nDocLength_bytes, &numread, 0);
m_seq.init((BYTE *)buffer, m_nDocLength_bytes);
// try to detect if this is an ascii/unicode/utf8 file
m_nFileFormat = detect_file_format(&m_nHeaderSize);
// work out where each line of text starts
if(!init_linebuffer())
clear();
CloseHandle(hFile);
delete[] buffer;
return true;
}
// Initialize the TextDocument with the specified file
//
/*bool TextDocument::save(TCHAR *filename)
{
HANDLE hFile;
hFile = CreateFile(filename, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
if(hFile == INVALID_HANDLE_VALUE)
return false;
CloseHandle(hFile);
return true;
}*/
//
// Parse the file lo
//
//
// From the unicode.org FAQ:
//
// 00 00 FE FF UTF-32, big-endian
// FF FE 00 00 UTF-32, little-endian
// FE FF UTF-16, big-endian
// FF FE UTF-16, little-endian
// EF BB BF UTF-8
//
// Match the first x bytes of the file against the
// Byte-Order-Mark (BOM) lookup table
//
int TextDocument::detect_file_format(int *m_nHeaderSize)
{
BYTE header[4] = { 0 };
m_seq.render(0, header, 4);
for(int i = 0; BOMLOOK[i].len; i++)
{
if(m_nDocLength_bytes >= BOMLOOK[i].len &&
memcmp(header, &BOMLOOK[i].bom, BOMLOOK[i].len) == 0)
{
*m_nHeaderSize = BOMLOOK[i].len;
return BOMLOOK[i].type;
}
}
*m_nHeaderSize = 0;
return NCP_ASCII; // default to ASCII
}
//
// Empty the data-TextDocument
//
bool TextDocument::clear()
{
m_seq.clear();
m_nDocLength_bytes = 0;
if(m_pLineBuf_byte)
{
delete[] m_pLineBuf_byte;
m_pLineBuf_byte = 0;
}
if(m_pLineBuf_char)
{
delete[] m_pLineBuf_char;
m_pLineBuf_char = 0;
}
m_nNumLines = 0;
return true;
}
bool TextDocument::EmptyDoc()
{
clear();
m_seq.init();
// this is not robust. it's just to get the thing
// up-and-running until I write a proper line-buffer mananger
m_pLineBuf_byte = new ULONG[0x1000];
m_pLineBuf_char = new ULONG[0x1000];
m_pLineBuf_byte[0] = 0;
m_pLineBuf_char[0] = 0;
return true;
}
//
// Return a UTF-32 character value
//
int TextDocument::getchar(ULONG offset, ULONG lenbytes, ULONG *pch32)
{
// BYTE *rawdata = (BYTE *)(buffer + offset + m_nHeaderSize);
BYTE rawdata[16];
lenbytes = min(16, lenbytes);
m_seq.render(offset+ m_nHeaderSize, rawdata, lenbytes);
#ifdef UNICODE
UTF16 *rawdata_w = (UTF16 *)rawdata;//(WCHAR*)(buffer + offset + m_nHeaderSize);
WCHAR ch16;
size_t ch32len = 1;
switch(m_nFileFormat)
{
case NCP_ASCII:
MultiByteToWideChar(CP_ACP, 0, (CCHAR*)rawdata, 1, &ch16, 1);
*pch32 = ch16;
return 1;
case NCP_UTF16:
return utf16_to_utf32(rawdata_w, lenbytes / 2, pch32, &ch32len) * sizeof(WCHAR);
case NCP_UTF16BE:
return utf16be_to_utf32(rawdata_w, lenbytes / 2, pch32, &ch32len) * sizeof(WCHAR);
case NCP_UTF8:
return utf8_to_utf32(rawdata, lenbytes, pch32);
default:
return 0;
}
#else
*pch32 = (ULONG)(BYTE)rawdata[0];
return 1;
#endif
}
//
// Fetch a buffer of UTF-16 text from the specified byte offset -
// returns the number of characters stored in buf
//
// Depending on how Neatpad was compiled (UNICODE vs ANSI) this function
// will always return text in the "native" format - i.e. Unicode or Ansi -
// so the necessary conversions will take place here.
//
// TODO: make sure the CR/LF is always fetched in one go
// make sure utf-16 surrogates kept together
// make sure that combining chars kept together
// make sure that bidirectional text kep together (will be *hard*)
//
// offset - BYTE offset within underlying data sequence
// lenbytes - max number of bytes to process (i.e. to limit to a line)
// buf - UTF16/ASCII output buffer
// plen - [in] - length of buffer, [out] - number of code-units stored
//
// returns - number of bytes processed
//
ULONG TextDocument::gettext(ULONG offset, ULONG lenbytes, TCHAR *buf, ULONG *buflen)
{
// BYTE *rawdata = (BYTE *)(buffer + offset + m_nHeaderSize);
ULONG chars_copied = 0;
ULONG bytes_processed = 0;
if(offset >= m_nDocLength_bytes)
{
*buflen = 0;
return 0;
}
while(lenbytes > 0 && *buflen > 0)
{
BYTE rawdata[0x100];
size_t rawlen = min(lenbytes, 0x100);
// get next block of data from the piece-table
m_seq.render(offset + m_nHeaderSize, rawdata, rawlen);
// convert to UTF-16
size_t tmplen = *buflen;
rawlen = rawdata_to_utf16(rawdata, rawlen, buf, &tmplen);
lenbytes -= rawlen;
offset += rawlen;
bytes_processed += rawlen;
buf += tmplen;
*buflen -= tmplen;
chars_copied += tmplen;
}
*buflen = chars_copied;
return bytes_processed;
//ULONG remaining = lenbytes;
//int charbuflen = *buflen;
//while(remaining)
/* {
lenbytes = min(lenbytes, sizeof(rawdata));
m_seq.render(offset + m_nHeaderSize, rawdata, lenbytes);
#ifdef UNICODE
switch(m_nFileFormat)
{
// convert from ANSI->UNICODE
case NCP_ASCII:
return ascii_to_utf16(rawdata, lenbytes, buf, (size_t*)buflen);
case NCP_UTF8:
return utf8_to_utf16(rawdata, lenbytes, buf, (size_t*)buflen);
// already unicode, do a straight memory copy
case NCP_UTF16:
return copy_utf16((WCHAR*)rawdata, lenbytes/sizeof(WCHAR), buf, (size_t*)buflen);
// need to convert from big-endian to little-endian
case NCP_UTF16BE:
return swap_utf16((WCHAR*)rawdata, lenbytes/sizeof(WCHAR), buf, (size_t*)buflen);
// error! we should *never* reach this point
default:
*buflen = 0;
return 0;
}
#else
switch(m_nFileFormat)
{
// we are already an ASCII app, so do a straight memory copy
case NCP_ASCII:
int len;
len = min(*buflen, lenbytes);
memcpy(buf, rawdata, len);
*buflen = len;
return len;
// anything else is an error - we cannot support Unicode or multibyte
// character sets with a plain ASCII app.
default:
*buflen = 0;
return 0;
}
#endif
// remaining -= lenbytes;
// buf += lenbytes;
// offset += lenbytes;
}*/
}
ULONG TextDocument::getdata(ULONG offset, BYTE *buf, size_t len)
{
//memcpy(buf, buffer + offset + m_nHeaderSize, len);
m_seq.render(offset + m_nHeaderSize, buf, len);
return len;
}
//
// Initialize the line-buffer
//
// With Unicode a newline sequence is defined as any of the following:
//
// \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A
//
bool TextDocument::init_linebuffer()
{
ULONG offset_bytes = 0;
ULONG offset_chars = 0;
ULONG linestart_bytes = 0;
ULONG linestart_chars = 0;
ULONG bytes_left = m_nDocLength_bytes - m_nHeaderSize;
ULONG buflen = m_nDocLength_bytes - m_nHeaderSize;
// allocate the line-buffer for storing each line's BYTE offset
if((m_pLineBuf_byte = new ULONG[buflen+1]) == 0)
return false;
// allocate the line-buffer for storing each line's CHARACTER offset
if((m_pLineBuf_char = new ULONG[buflen+1]) == 0)
return false;
m_nNumLines = 0;
// loop through every byte in the file
for(offset_bytes = 0; offset_bytes < buflen; )
{
ULONG ch32;
// get a UTF-32 character from the underlying file format.
// this needs serious thought. Currently
ULONG len = getchar(offset_bytes, buflen - offset_bytes, &ch32);
offset_bytes += len;
offset_chars += 1;
if(ch32 == '\r')
{
// record where the line starts
m_pLineBuf_byte[m_nNumLines] = linestart_bytes;
m_pLineBuf_char[m_nNumLines] = linestart_chars;
linestart_bytes = offset_bytes;
linestart_chars = offset_chars;
// look ahead to next char
len = getchar(offset_bytes, buflen - offset_bytes, &ch32);
offset_bytes += len;
offset_chars += 1;
// carriage-return / line-feed combination
if(ch32 == '\n')
{
linestart_bytes = offset_bytes;
linestart_chars = offset_chars;
}
m_nNumLines++;
}
else if(ch32 == '\n' || ch32 == '\x0b' || ch32 == '\x0c' || ch32 == 0x0085 || ch32 == 0x2029 || ch32 == 0x2028)
{
// record where the line starts
m_pLineBuf_byte[m_nNumLines] = linestart_bytes;
m_pLineBuf_char[m_nNumLines] = linestart_chars;
linestart_bytes = offset_bytes;
linestart_chars = offset_chars;
m_nNumLines++;
}
// force a 'hard break'
else if(offset_chars - linestart_chars > 128)
{
m_pLineBuf_byte[m_nNumLines] = linestart_bytes;
m_pLineBuf_char[m_nNumLines] = linestart_chars;
linestart_bytes = offset_bytes;
linestart_chars = offset_chars;
m_nNumLines++;
}
}
if(buflen > 0)
{
m_pLineBuf_byte[m_nNumLines] = linestart_bytes;
m_pLineBuf_char[m_nNumLines] = linestart_chars;
m_nNumLines++;
}
m_pLineBuf_byte[m_nNumLines] = buflen;
m_pLineBuf_char[m_nNumLines] = offset_chars;
return true;
}
//
// Return the number of lines
//
ULONG TextDocument::linecount()
{
return m_nNumLines;
}
//
// Return the length of longest line
//
ULONG TextDocument::longestline(int tabwidth)
{
//ULONG i;
ULONG longest = 0;
ULONG xpos = 0;
// char *bufptr = (char *)(buffer + m_nHeaderSize);
/*
for(i = 0; i < length_bytes; i++)
{
if(bufptr[i] == '\r')
{
if(bufptr[i+1] == '\n')
i++;
longest = max(longest, xpos);
xpos = 0;
}
else if(bufptr[i] == '\n')
{
longest = max(longest, xpos);
xpos = 0;
}
else if(bufptr[i] == '\t')
{
xpos += tabwidth - (xpos % tabwidth);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -