📄 textoutputdev.cc

📁 将pdf文档转换为高质量的html文档
💻 CC
📖 第 1 页 / 共 5 页
字号:
    }  }}TextWordList::~TextWordList() {  delete words;}int TextWordList::getLength() {  return words->getLength();}TextWord *TextWordList::get(int idx) {  if (idx < 0 || idx >= words->getLength()) {    return NULL;  }  return (TextWord *)words->get(idx);}#endif // TEXTOUT_WORD_LIST//------------------------------------------------------------------------// TextPage//------------------------------------------------------------------------TextPage::TextPage(GBool rawOrderA) {  int rot;  rawOrder = rawOrderA;  curWord = NULL;  charPos = 0;  curFont = NULL;  curFontSize = 0;  nest = 0;  nTinyChars = 0;  lastCharOverlap = gFalse;  if (!rawOrder) {    for (rot = 0; rot < 4; ++rot) {      pools[rot] = new TextPool();    }  }  flows = NULL;  blocks = NULL;  rawWords = NULL;  rawLastWord = NULL;  fonts = new GList();  lastFindXMin = lastFindYMin = 0;  haveLastFind = gFalse;}TextPage::~TextPage() {  int rot;  clear();  if (!rawOrder) {    for (rot = 0; rot < 4; ++rot) {      delete pools[rot];    }  }  delete fonts;}void TextPage::startPage(GfxState *state) {  clear();  if (state) {    pageWidth = state->getPageWidth();    pageHeight = state->getPageHeight();  } else {    pageWidth = pageHeight = 0;  }}void TextPage::endPage() {  if (curWord) {    endWord();  }}void TextPage::clear() {  int rot;  TextFlow *flow;  TextWord *word;  if (curWord) {    delete curWord;    curWord = NULL;  }  if (rawOrder) {    while (rawWords) {      word = rawWords;      rawWords = rawWords->next;      delete word;    }  } else {    for (rot = 0; rot < 4; ++rot) {      delete pools[rot];    }    while (flows) {      flow = flows;      flows = flows->next;      delete flow;    }    gfree(blocks);  }  deleteGList(fonts, TextFontInfo);  curWord = NULL;  charPos = 0;  curFont = NULL;  curFontSize = 0;  nest = 0;  nTinyChars = 0;  if (!rawOrder) {    for (rot = 0; rot < 4; ++rot) {      pools[rot] = new TextPool();    }  }  flows = NULL;  blocks = NULL;  rawWords = NULL;  rawLastWord = NULL;  fonts = new GList();}void TextPage::updateFont(GfxState *state) {  GfxFont *gfxFont;  double *fm;  char *name;  int code, mCode, letterCode, anyCode;  double w;  int i;  // get the font info object  curFont = NULL;  for (i = 0; i < fonts->getLength(); ++i) {    curFont = (TextFontInfo *)fonts->get(i);    if (curFont->matches(state)) {      break;    }    curFont = NULL;  }  if (!curFont) {    curFont = new TextFontInfo(state);    fonts->append(curFont);  }  // adjust the font size  gfxFont = state->getFont();  curFontSize = state->getTransformedFontSize();  if (gfxFont && gfxFont->getType() == fontType3) {    // This is a hack which makes it possible to deal with some Type 3    // fonts.  The problem is that it's impossible to know what the    // base coordinate system used in the font is without actually    // rendering the font.  This code tries to guess by looking at the    // width of the character 'm' (which breaks if the font is a    // subset that doesn't contain 'm').    mCode = letterCode = anyCode = -1;    for (code = 0; code < 256; ++code) {      name = ((Gfx8BitFont *)gfxFont)->getCharName(code);      if (name && name[0] == 'm' && name[1] == '\0') {	mCode = code;      }      if (letterCode < 0 && name && name[1] == '\0' &&	  ((name[0] >= 'A' && name[0] <= 'Z') ||	   (name[0] >= 'a' && name[0] <= 'z'))) {	letterCode = code;      }      if (anyCode < 0 && name &&	  ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {	anyCode = code;      }    }    if (mCode >= 0 &&	(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {      // 0.6 is a generic average 'm' width -- yes, this is a hack      curFontSize *= w / 0.6;    } else if (letterCode >= 0 &&	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {      // even more of a hack: 0.5 is a generic letter width      curFontSize *= w / 0.5;    } else if (anyCode >= 0 &&	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {      // better than nothing: 0.5 is a generic character width      curFontSize *= w / 0.5;    }    fm = gfxFont->getFontMatrix();    if (fm[0] != 0) {      curFontSize *= fabs(fm[3] / fm[0]);    }  }}void TextPage::beginWord(GfxState *state, double x0, double y0) {  double *fontm;  double m[4], m2[4];  int rot;  // This check is needed because Type 3 characters can contain  // text-drawing operations (when TextPage is being used via  // {X,Win}SplashOutputDev rather than TextOutputDev).  if (curWord) {    ++nest;    return;  }  // compute the rotation  state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);  if (state->getFont()->getType() == fontType3) {    fontm = state->getFont()->getFontMatrix();    m2[0] = fontm[0] * m[0] + fontm[1] * m[2];    m2[1] = fontm[0] * m[1] + fontm[1] * m[3];    m2[2] = fontm[2] * m[0] + fontm[3] * m[2];    m2[3] = fontm[2] * m[1] + fontm[3] * m[3];    m[0] = m2[0];    m[1] = m2[1];    m[2] = m2[2];    m[3] = m2[3];  }  if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {    rot = (m[3] < 0) ? 0 : 2;  } else {    rot = (m[2] > 0) ? 1 : 3;  }  curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);}void TextPage::addChar(GfxState *state, double x, double y,		       double dx, double dy,		       CharCode c, int nBytes, Unicode *u, int uLen) {  double x1, y1, w1, h1, dx2, dy2, base, sp, delta;  GBool overlap;  int i;  // throw away chars that aren't inside the page bounds  state->transform(x, y, &x1, &y1);  if (x1 < 0 || x1 > pageWidth ||      y1 < 0 || y1 > pageHeight) {    charPos += nBytes;    return;  }  // subtract char and word spacing from the dx,dy values  sp = state->getCharSpace();  if (c == (CharCode)0x20) {    sp += state->getWordSpace();  }  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);  dx -= dx2;  dy -= dy2;  state->transformDelta(dx, dy, &w1, &h1);  // check the tiny chars limit  if (!globalParams->getTextKeepTinyChars() &&      fabs(w1) < 3 && fabs(h1) < 3) {    if (++nTinyChars > 50000) {      charPos += nBytes;      return;    }  }  // break words at space character  if (uLen == 1 && u[0] == (Unicode)0x20) {    if (curWord) {      ++curWord->charLen;    }    charPos += nBytes;    endWord();    return;  }  // start a new word if:  // (1) this character doesn't fall in the right place relative to  //     the end of the previous word (this places upper and lower  //     constraints on the position deltas along both the primary  //     and secondary axes), or  // (2) this character overlaps the previous one (duplicated text), or  // (3) the previous character was an overlap (we want each duplicated  //     character to be in a word by itself at this stage)  if (curWord && curWord->len > 0) {    base = sp = delta = 0; // make gcc happy    switch (curWord->rot) {    case 0:      base = y1;      sp = x1 - curWord->xMax;      delta = x1 - curWord->edge[curWord->len - 1];      break;    case 1:      base = x1;      sp = y1 - curWord->yMax;      delta = y1 - curWord->edge[curWord->len - 1];      break;    case 2:      base = y1;      sp = curWord->xMin - x1;      delta = curWord->edge[curWord->len - 1] - x1;      break;    case 3:      base = x1;      sp = curWord->yMin - y1;      delta = curWord->edge[curWord->len - 1] - y1;      break;    }    overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&              fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;    if (overlap || lastCharOverlap ||	sp < -minDupBreakOverlap * curWord->fontSize ||	sp > minWordBreakSpace * curWord->fontSize ||	fabs(base - curWord->base) > 0.5) {      endWord();    }    lastCharOverlap = overlap;  } else {    lastCharOverlap = gFalse;  }  if (uLen != 0) {    // start a new word if needed    if (!curWord) {      beginWord(state, x, y);    }    // page rotation and/or transform matrices can cause text to be    // drawn in reverse order -- in this case, swap the begin/end    // coordinates and break text into individual chars    if ((curWord->rot == 0 && w1 < 0) ||	(curWord->rot == 1 && h1 < 0) ||	(curWord->rot == 2 && w1 > 0) ||	(curWord->rot == 3 && h1 > 0)) {      endWord();      beginWord(state, x + dx, y + dy);      x1 += w1;      y1 += h1;      w1 = -w1;      h1 = -h1;    }    // add the characters to the current word    w1 /= uLen;    h1 /= uLen;    for (i = 0; i < uLen; ++i) {      curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);    }  }  if (curWord) {    curWord->charLen += nBytes;  }  charPos += nBytes;}void TextPage::endWord() {  // This check is needed because Type 3 characters can contain  // text-drawing operations (when TextPage is being used via  // {X,Win}SplashOutputDev rather than TextOutputDev).  if (nest > 0) {    --nest;    return;  }  if (curWord) {    addWord(curWord);    curWord = NULL;  }}void TextPage::addWord(TextWord *word) {  // throw away zero-length words -- they don't have valid xMin/xMax  // values, and they're useless anyway  if (word->len == 0) {    delete word;    return;  }  if (rawOrder) {    if (rawLastWord) {      rawLastWord->next = word;    } else {      rawWords = word;    }    rawLastWord = word;  } else {    pools[word->rot]->addWord(word);  }}void TextPage::coalesce(GBool physLayout) {  UnicodeMap *uMap;  TextPool *pool;  TextWord *word0, *word1, *word2;  TextLine *line;  TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;  TextBlock **blkArray;  TextFlow *flow, *lastFlow;  int rot, poolMinBaseIdx, baseIdx, startBaseIdx;  double minBase, maxBase, newMinBase, newMaxBase;  double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;  GBool found;  int count[4];  int lrCount;  int firstBlkIdx, nBlocksLeft;  int col1, col2;  int i, j, n;  if (rawOrder) {    primaryRot = 0;    primaryLR = gTrue;    return;  }  uMap = globalParams->getTextEncoding();  blkList = NULL;  lastBlk = NULL;  nBlocks = 0;  primaryRot = -1;#if 0 // for debugging  printf("*** initial words ***\n");  for (rot = 0; rot < 4; ++rot) {    pool = pools[rot];    for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {      for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d '",	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,	       word0->base, word0->fontSize, rot*90);	for (i = 0; i < word0->len; ++i) {	  fputc(word0->text[i] & 0xff, stdout);	}	printf("'\n");      }    }  }  printf("\n");#endif  //----- assemble the blocks  //~ add an outer loop for writing mode (vertical text)  // build blocks for each rotation value  for (rot = 0; rot < 4; ++rot) {    pool = pools[rot];    poolMinBaseIdx = pool->minBaseIdx;    count[rot] = 0;    // add blocks until no more words are left    while (1) {      // find the first non-empty line in the pool      for (;	   poolMinBaseIdx <= pool->maxBaseIdx &&	     !pool->getPool(poolMinBaseIdx);	   ++poolMinBaseIdx) ;      if (poolMinBaseIdx > pool->maxBaseIdx) {	break;      }      // look for the left-most word in the first four lines of the      // pool -- this avoids starting with a superscript word      startBaseIdx = poolMinBaseIdx;      for (baseIdx = poolMinBaseIdx + 1;	   baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;	   ++baseIdx) {	if (!pool->getPool(baseIdx)) {	  continue;	}	if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))	    < 0) {	  startBaseIdx = baseIdx;	}      }      // create a new block      word0 = pool->getPool(startBaseIdx);      pool->setPool(startBaseIdx, word0->next);      word0->next = NULL;      blk = new TextBlock(this, rot);      blk->addWord(word0);      fontSize = word0->fontSize;      minBase = maxBase = word0->base;      colSpace1 = minColSpacing1 * fontSize;      colSpace2 = minColSpacing2 * fontSize;      lineSpace = maxLineSpacingDelta * fontSize;      intraLineSpace = maxIntraLineDelta * fontSize;      // add words to the block      do {	found = gFalse;	// look for words on the line above the current top edge of	// the block	newMinBase = minBase;	for (baseIdx = pool->getBaseIdx(minBase);	     baseIdx >= pool->getBaseIdx(minBase - lineSpace);	     --baseIdx) {	  word0 = NULL;	  word1 = pool->getPool(baseIdx);	  while (word1) {	    if (word1->base < minBase &&		word1->base >= minBase - lineSpace &&		((rot == 0 || rot == 2)		 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)		 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&		fabs(word1->fontSize - fontSize) <		  maxBlockFontSizeDelta1 * fontSize) {	      word2 = word1;	      if (word0) {		word0->next = word1->next;	      } else {		pool->setPool(baseIdx, word1->next);	      }	      word1 = word1->next;	      word2->next = NULL;	      blk->addWord(word2);	      found = gTrue;	      newMinBase = word2->base;	    } else {	      word0 = word1;	      word1 = word1->next;	    }	  }	}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -