📄 file.cc
字号:
// Larbin// Sebastien Ailleret// 14-12-99 -> 31-10-00#include <unistd.h>#include <iostream.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <errno.h>#include "pthread.h"#include "types.h"#include "global.h"#include "xfetcher/file.h"#include "xfetcher/fetchOpen.h"#include "xfetcher/checker.h"#include "xutils/text.h"#include "xutils/string.h"#include "xutils/Site.h"#include "xutils/Vector.h"#include "xutils/debug.h"#define ANSWER 0#define HEADERS 1#define HTML 3#define TAG 5#define TAGCONTENT 6#define SPECIFIC 7#define NONE 0#define AHREF 1#define BASE 2#define FRAME 3/*********************************** * implementation of file ***********************************/file::file () { answer = new String(64);}file::~file () {}/** Is it a robots.txt */bool file::isRobots () { return true;}/*********************************** * implementation of robots ***********************************//** Constructor */robots::robots (url *next) : file() { newPars(); this->next = next;}/** Destructor */robots::~robots () { delPars(); delete answer; // next and glob are not deleted on purpose // they belong to someone else}/** we get some more chars of this file */int robots::input (char *c, int size) { answer->addBuffer(c, size); if (answer->getLength() > maxRobotsSize) { // no more input, forget the end of this file errno = tooBig; return 1; } else { return 0; }}/** parse the robots.txt */void robots::parse (bool isError) { if (parseHeaders()) { if (isError) { // If the file could be incomplete, delete last token // We could have Disallow / instead of Disallow /blabla for (uint i = answer->getLength()-1; i>0 && !isspace((*answer)[i]); i--) { answer->setChar(i, ' '); } } uncomment(); parseRobots(); }}/** Uncomment this file */void robots::uncomment () { for (uint i=pos, comment=0, cont=1; cont; i++) { switch ((*answer)[i]) { case ':': answer->setChar(i, ' '); break; case '#': comment = 1; answer->setChar(i, ' '); break; case '\n': comment = 0; break; case 0: return; default: if (comment) { answer->setChar(i, ' '); } break; } }}/** test http headers * return 1 if OK, 0 otherwise */bool robots::parseHeaders () { uint len = answer->getLength(); if (len > 12 && (*answer)[9] != '4') { // answer code suggest success for(pos = 9; pos+3 < len; pos++) { if (((*answer)[pos] == '\n' && ((*answer)[pos+1] == '\n' || (*answer)[pos+2] == '\n')) || ((*answer)[pos] == '\r' && ((*answer)[pos+1] == '\r' || (*answer)[pos+2] == '\r'))) { siteRobots(); return true; } } return false; } else { return false; }}/** read the file */void robots::parseRobots () { robotsOK();#ifndef NOSTATS int goodfile = 1;#endif // NOSTATS unsigned int num = next->hostHashCode() % siteListSize; Site *server = global::siteList + num; assert (server->forbidden == NULL); server->forbidden = new Vector<char>; uint items = 0; // size of server->forbidden // state // 0 : not concerned // 1 : weakly concerned // 2 : strongly concerned int state = 1; char *tok = nextToken(*answer, &pos); while (tok != NULL) { if (!strcasecmp(tok, "useragent") || !strcasecmp(tok, "user-agent")) { delete [] tok; if (state == 2) { // end of strong concern record => the end for us return; } else { state = 0; // what is the new state ? tok = nextToken(*answer, &pos); while (tok != NULL && strcasecmp(tok, "useragent") && strcasecmp(tok, "user-agent") && strcasecmp(tok, "disallow")) { if (contain(global::userAgent, tok)) { state = 2; } else if (state == 0 && !strcmp(tok, "*")) { state = 1; } delete [] tok; tok = nextToken(*answer, &pos); } } if (state) { // delete old forbidden : we've got a better record than older ones assert (server->forbidden != NULL); server->forbidden->recycle(); items = 0; } } else if (!strcasecmp(tok, "disallow")) { delete [] tok; if (state) { tok = getGoodToken(); while (tok != NULL && strcasecmp(tok, "useragent") && strcasecmp(tok, "user-agent") && strcasecmp(tok, "disallow") && state) { // add nextToken to forbidden if (items++ < maxRobotsItem) { server->forbidden->addElement(tok); } else { delete [] tok; state = 0; server->forbidden->recycle(); server->forbidden->addElement(newString("/")); items = 1; } tok = getGoodToken(); } } else { // We're not concerned by this record tok = getGoodToken(); while (tok != NULL && tok[0] == '/') { delete [] tok; tok = getGoodToken(); } } } else {#ifndef NOSTATS if (goodfile) { robotsOKdec(); goodfile = 0; state = 0; server->forbidden->recycle(); server->forbidden->addElement(newString("/")); items = 1; }#endif // NOSTATS delete [] tok; tok = nextToken(*answer, &pos); } }}/** Get a token * try to understand it as a path, so correct errors of lazzy webmasters */char *robots::getGoodToken() { char *res = nextToken(*answer, &pos); if (res != NULL) { switch (res[0]) { case '*': // it's forbidden to have an * here, // for excluding everything, the real syntax is '/' res[0] = '/'; break; case '/': break; default: { // add / before the first char // with this method, we lose last char // but it should not be a shame : // We are a little more restrictive this way // but it avoid to realloc a char* char c = '/'; for (int i=0; res[i] != 0; i++) { char tmp = res[i]; res[i] = c; c = tmp; } } break; } } return res;}/************************************* * implementation of html *************************************//** Constructor */html::html (url *here) : file() { newPars(); this->here = here; base = here->giveBase(); state = ANSWER; content = new String(BUF_SIZE); headers = new String; needAnswer = !global::isSpecific; pages();}/** Destructor */html::~html () { delPars(); delete here; delete content; delete headers; delete answer; delete [] base;}/** a string is arriving * return 0 usually, 1 if don't want any more input */int html::input (char *c, int size) { int i=0; bool to_save = true; char *posn; while (i < size) { if (to_save && state!=ANSWER && state!=HEADERS && needAnswer) { to_save = false; answer->addBuffer (c+i, size-i); } switch (state) { case SPECIFIC: i = size; break; case ANSWER: posn = index (c, '\n'); if (posn == NULL) { content->addBuffer (c, size); headers->addBuffer (c, size); i = size; } else { int nb = posn - c; content->addBuffer (c, nb); headers->addBuffer (c, nb+1); i += nb+1; if (parseCmdline ()) { return 1; } } break; case HEADERS: posn = index (c+i, '\n'); if (posn == NULL) { content->addBuffer (c+i, size-i); headers->addBuffer (c+i, size-i); i = size; } else { int nb = posn - c - i; content->addBuffer (c+i, nb); headers->addBuffer (c+i, nb+1); i += nb+1; int tmp = parseHeader(); content->recycle(); if (tmp) { return 1; } } break; case HTML: posn = index (c+i, '<'); if (posn != NULL) { state = TAG; tagId = NONE; tagPos = 1; i = posn - c + 1; } else { i = size; } break; case TAG: // That's a tag if (c[i]!='\n' && c[i]!=' ' && c[i]!='\t' && c[i]!='\r') { // interesting char matchTag(lowerCase(c[i])); } i++; break; case TAGCONTENT: // Content of the tag { int end_item = i + MAX_URL_SIZE - content->getLength(); int it = i; while (it < end_item && c[it]!='\"' && c[it]!='\n' && c[it]!=' ') { it++; } if (c[it] == '\"') { // compute this url content->addBuffer(c+i, it-i); tagContent (); } else { // unusual end : forget this url state = HTML; content->recycle (); } i = it+1; } break; default: cerr << "switch not exhausive in input(char *c, int size) (file.cc)\n"; break; } } return 0;}/** parse the answer code line */inline int html::parseCmdline () { state = HEADERS; // usefull only if return 0 errno = err40X; // usefull only if return 1 int tmp = (content->getLength() < 12 || ((*content)[9] != '2' && (*content)[9] != '3')); content->recycle(); return tmp;}/** parse a line of header * @return 0 if OK, 1 if we don't want to read the file */int html::parseHeader () { if (content->getLength() < 2) { // end of http headers if (global::isSpecific && needAnswer) { state = SPECIFIC; } else { state = HTML; } } else { char *line = content->getString(); if (startWithIgnoreCase("Content-Type: ", line)) { // Let's read the type of this doc if (!startWithIgnoreCase("text/html", line+14)) { if (global::isSpecific && startWithIgnoreCase(global::contentType, line+14)) { interestingSeen(); needAnswer = true; } else { errno = badType; return 1; } } } } return 0;}/** Try to understand this tag */void html::matchTag (char c) { switch (tagId) { case NONE: //no tag yet switch (c) { case 'a': tagId = AHREF; break; case 'b': tagId = BASE; break; default: state = HTML; } break; case AHREF: // ahref tag if ("ahref=\""[tagPos] == c) { tagPos++; if (tagPos == 7) { state = TAGCONTENT; } } else { state = HTML; } break; case FRAME: // framesrc tag if ("framesrc=\""[tagPos] == c) { tagPos++; if (tagPos == 10) { tagId = AHREF; state = TAGCONTENT; } } else { state = HTML; } break; case BASE: // basehref tag if ("basehref=\""[tagPos] == c) { tagPos++; if (tagPos == 10) { state = TAGCONTENT; } } else { state = HTML; } break; }}/** read the content of an interesting tag */void html::tagContent () { switch (tagId) { case AHREF: { // try to understand this new link uint dpth = here->getDepth(); url *nouv = new url(newString(content->getString()), dpth-1, base); if (nouv->isValid()) { nouv->setDepth(here->getHost()); if (filter1(nouv->getHost(), nouv->getFile())) { // The extension is not stupid (gz, pdf...) links.addElement(nouv->giveUrl()); // good url => send it if (dpth > 0) { check(nouv); } else { // We are too deep in this site delete nouv; } } else { // The extension is stupid delete nouv; } } else { // Bad url => delete it delete nouv; } } break; case BASE: // This page has a BASE HREF tag char *value = content->getString(); if (startWithIgnoreCase("http://", value)) { // http://host/a/b/c.html becomes host/a/b/ uint i, end = strlen(value) - 1; while (end >= 7 && value[end] != '/') { end--; } if (end > 7) { // the value looks normal delete [] base; base = new char[end-5]; for (i=0; i<=end-7; i++) { base[i] = value[i+7]; } base[end-6] = 0; } } break; } state = HTML; content->recycle();}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -