📄 file.cc

📁 100 病毒源碼,原始碼,無毒 ......
💻 CC
字号:
// Larbin// Sebastien Ailleret// 14-12-99 -> 31-10-00#include <unistd.h>#include <iostream.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <errno.h>#include "pthread.h"#include "types.h"#include "global.h"#include "xfetcher/file.h"#include "xfetcher/fetchOpen.h"#include "xfetcher/checker.h"#include "xutils/text.h"#include "xutils/string.h"#include "xutils/Site.h"#include "xutils/Vector.h"#include "xutils/debug.h"#define ANSWER 0#define HEADERS 1#define HTML 3#define TAG 5#define TAGCONTENT 6#define SPECIFIC 7#define NONE 0#define AHREF 1#define BASE 2#define FRAME 3/*********************************** * implementation of file ***********************************/file::file () {  answer = new String(64);}file::~file () {}/** Is it a robots.txt */bool file::isRobots () {  return true;}/*********************************** * implementation of robots ***********************************//** Constructor */robots::robots (url *next) : file() {  newPars();  this->next = next;}/** Destructor */robots::~robots () {  delPars();  delete answer;  // next and glob are not deleted on purpose  // they belong to someone else}/** we get some more chars of this file */int robots::input (char *c, int size) {  answer->addBuffer(c, size);  if (answer->getLength() > maxRobotsSize) {	// no more input, forget the  end of this file	errno = tooBig;	return 1;  } else {	return 0;  }}/** parse the robots.txt */void robots::parse (bool isError) {  if (parseHeaders()) {	if (isError) {	  // If the file could be incomplete, delete last token	  // We could have Disallow / instead of Disallow /blabla	  for (uint i = answer->getLength()-1; i>0 && !isspace((*answer)[i]); i--) {		answer->setChar(i, ' ');	  }	}    uncomment();    parseRobots();  }}/** Uncomment this file */void robots::uncomment () {  for (uint i=pos, comment=0, cont=1; cont; i++) {    switch ((*answer)[i]) {    case ':':      answer->setChar(i, ' ');      break;    case '#':      comment = 1;      answer->setChar(i, ' ');      break;    case '\n':      comment = 0;      break;    case 0:      return;    default:      if (comment) {        answer->setChar(i, ' ');      }      break;    }  }}/** test http headers * return 1 if OK, 0 otherwise */bool robots::parseHeaders () {  uint len = answer->getLength();  if (len > 12 && (*answer)[9] != '4') {    // answer code suggest success	for(pos = 9; pos+3 < len; pos++) {      if (((*answer)[pos] == '\n' && ((*answer)[pos+1] == '\n'										 || (*answer)[pos+2] == '\n'))          || ((*answer)[pos] == '\r' && ((*answer)[pos+1] == '\r'                                         || (*answer)[pos+2] == '\r'))) {        siteRobots();        return true;      }    }    return false;  } else {	return false;  }}/** read the file */void robots::parseRobots () {  robotsOK();#ifndef NOSTATS  int goodfile = 1;#endif // NOSTATS  unsigned int num = next->hostHashCode() % siteListSize;  Site *server = global::siteList + num;  assert (server->forbidden == NULL);  server->forbidden = new Vector<char>;  uint items = 0; // size of server->forbidden  // state  // 0 : not concerned  // 1 : weakly concerned  // 2 : strongly concerned  int state = 1;  char *tok = nextToken(*answer, &pos);  while (tok != NULL) {	if (!strcasecmp(tok, "useragent") || !strcasecmp(tok, "user-agent")) {	  delete [] tok;	  if (state == 2) {		// end of strong concern record => the end for us		return;	  } else {		state = 0;		// what is the new state ?		tok = nextToken(*answer, &pos);		while (tok != NULL			   && strcasecmp(tok, "useragent")			   && strcasecmp(tok, "user-agent")			   && strcasecmp(tok, "disallow")) {		  if (contain(global::userAgent, tok)) {			state = 2;		  } else if (state == 0 && !strcmp(tok, "*")) {			state = 1;		  }		  delete [] tok;		  tok = nextToken(*answer, &pos);		}	  }	  if (state) {		// delete old forbidden : we've got a better record than older ones		assert (server->forbidden != NULL);		server->forbidden->recycle();		items = 0;	  }	} else if (!strcasecmp(tok, "disallow")) {	  delete [] tok;	  if (state) {		tok = getGoodToken();		while (tok != NULL			   && strcasecmp(tok, "useragent")			   && strcasecmp(tok, "user-agent")			   && strcasecmp(tok, "disallow")               && state) {		  // add nextToken to forbidden		  if (items++ < maxRobotsItem) {			server->forbidden->addElement(tok);		  } else {			delete [] tok;            state = 0;            server->forbidden->recycle();            server->forbidden->addElement(newString("/"));            items = 1;		  }		  tok = getGoodToken();		}	  } else {		// We're not concerned by this record		tok = getGoodToken();		while (tok != NULL			   && tok[0] == '/') {		  delete [] tok;		  tok = getGoodToken();		}	  }	} else {#ifndef NOSTATS	  if (goodfile) {		robotsOKdec();		goodfile = 0;        state = 0;		server->forbidden->recycle();        server->forbidden->addElement(newString("/"));        items = 1;	  }#endif // NOSTATS	  delete [] tok;	  tok = nextToken(*answer, &pos);	}  }}/** Get a token * try to understand it as a path, so correct errors of lazzy webmasters */char *robots::getGoodToken() {  char *res = nextToken(*answer, &pos);  if (res != NULL) {	switch (res[0]) {	case '*':	  // it's forbidden to have an * here,	  // for excluding everything, the real syntax is '/'	  res[0] = '/';	  break;    case '/':      break;	default:	  {		// add / before the first char		// with this method, we lose last char		// but it should not be a shame :		// We are a little more restrictive this way		// but it avoid to realloc a char*		char c = '/';		for (int i=0; res[i] != 0; i++) {		  char tmp = res[i];		  res[i] = c;		  c = tmp;		}	  }	  break;	}  }  return res;}/************************************* * implementation of html *************************************//** Constructor */html::html (url *here) : file() {  newPars();  this->here = here;  base = here->giveBase();  state = ANSWER;  content = new String(BUF_SIZE);  headers = new String;  needAnswer = !global::isSpecific;  pages();}/** Destructor */html::~html () {  delPars();  delete here;  delete content;  delete headers;  delete answer;  delete [] base;}/** a string is arriving * return 0 usually, 1 if don't want any more input */int html::input (char *c, int size) {  int i=0;  bool to_save = true;  char *posn;  while (i < size) {    if (to_save && state!=ANSWER && state!=HEADERS && needAnswer) {      to_save = false;      answer->addBuffer (c+i, size-i);    }    switch (state) {    case SPECIFIC:      i = size;      break;    case ANSWER:      posn = index (c, '\n');      if (posn == NULL) {        content->addBuffer (c, size);        headers->addBuffer (c, size);        i = size;      } else {        int nb = posn - c;        content->addBuffer (c, nb);        headers->addBuffer (c, nb+1);        i += nb+1;        if (parseCmdline ()) {          return 1;        }      }      break;    case HEADERS:      posn = index (c+i, '\n');      if (posn == NULL) {        content->addBuffer (c+i, size-i);        headers->addBuffer (c+i, size-i);        i = size;      } else {        int nb = posn - c - i;        content->addBuffer (c+i, nb);        headers->addBuffer (c+i, nb+1);        i += nb+1;        int tmp = parseHeader();        content->recycle();        if (tmp) {          return 1;        }      }      break;    case HTML:      posn = index (c+i, '<');      if (posn != NULL) {        state = TAG;        tagId = NONE;        tagPos = 1;        i = posn - c + 1;      } else {        i = size;      }      break;    case TAG:      // That's a tag      if (c[i]!='\n' && c[i]!=' ' && c[i]!='\t' && c[i]!='\r') {        // interesting char        matchTag(lowerCase(c[i]));      }      i++;      break;    case TAGCONTENT:      // Content of the tag      {        int end_item = i + MAX_URL_SIZE - content->getLength();        int it = i;        while (it < end_item && c[it]!='\"'               && c[it]!='\n' && c[it]!=' ') {          it++;        }        if (c[it] == '\"') { // compute this url          content->addBuffer(c+i, it-i);          tagContent ();        } else { // unusual end : forget this url          state = HTML;          content->recycle ();        }        i = it+1;      }      break;    default:      cerr << "switch not exhausive in input(char *c, int size) (file.cc)\n";      break;    }  }  return 0;}/** parse the answer code line */inline int html::parseCmdline () {  state = HEADERS;   // usefull only if return 0  errno = err40X;    // usefull only if return 1  int tmp = (content->getLength() < 12             || ((*content)[9] != '2' && (*content)[9] != '3'));  content->recycle();  return tmp;}/** parse a line of header * @return 0 if OK, 1 if we don't want to read the file */int html::parseHeader () {  if (content->getLength() < 2) {	// end of http headers	if (global::isSpecific && needAnswer) {	  state = SPECIFIC;	} else {	  state = HTML;	}  } else {	char *line = content->getString();	if (startWithIgnoreCase("Content-Type: ", line)) {	  // Let's read the type of this doc	  if (!startWithIgnoreCase("text/html", line+14)) {		if (global::isSpecific			&& startWithIgnoreCase(global::contentType, line+14)) {		  interestingSeen();		  needAnswer = true;		} else {		  errno = badType;		  return 1;		}	  }	}  }  return 0;}/** Try to understand this tag */void html::matchTag (char c) {  switch (tagId) {  case NONE:	//no tag yet	switch (c) {	case 'a':	  tagId = AHREF;	  break;	case 'b':	  tagId = BASE;	  break;	default:	  state = HTML;	}	break;  case AHREF:	// ahref tag	if ("ahref=\""[tagPos] == c) {	  tagPos++;	  if (tagPos == 7) {		state = TAGCONTENT;	  }	} else {	  state = HTML;	}	break;  case FRAME:	// framesrc tag	if ("framesrc=\""[tagPos] == c) {	  tagPos++;	  if (tagPos == 10) {		tagId = AHREF;		state = TAGCONTENT;	  }	} else {	  state = HTML;	}	break;  case BASE:	// basehref tag	if ("basehref=\""[tagPos] == c) {	  tagPos++;	  if (tagPos == 10) {		state = TAGCONTENT;	  }	} else {	  state = HTML;	}	break;  }}/** read the content of an interesting tag */void html::tagContent () {  switch (tagId) {  case AHREF:    {      // try to understand this new link      uint dpth = here->getDepth();      url *nouv = new url(newString(content->getString()), dpth-1, base);      if (nouv->isValid()) {        nouv->setDepth(here->getHost());        if (filter1(nouv->getHost(), nouv->getFile())) {          // The extension is not stupid (gz, pdf...)          links.addElement(nouv->giveUrl());          // good url => send it          if (dpth > 0) {            check(nouv);          } else {            // We are too deep in this site            delete nouv;          }        } else {          // The extension is stupid          delete nouv;        }      } else {        // Bad url => delete it        delete nouv;      }    }    break;  case BASE:    // This page has a BASE HREF tag    char *value = content->getString();    if (startWithIgnoreCase("http://", value)) {      // http://host/a/b/c.html becomes host/a/b/      uint i, end = strlen(value) - 1;      while (end >= 7 && value[end] != '/') {        end--;      }      if (end > 7) {        // the value looks normal        delete [] base;        base = new char[end-5];        for (i=0; i<=end-7; i++) {          base[i] = value[i+7];        }        base[end-6] = 0;      }    }    break;  }  state = HTML;  content->recycle();}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -