📄 file.cc
字号:
// Larbin// Sebastien Ailleret// 14-12-99 -> 19-03-02#include <unistd.h>#include <iostream.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <errno.h>#include <sys/types.h>#include <sys/socket.h>#include "options.h"#include "types.h"#include "global.h"#include "utils/text.h"#include "utils/url.h"#include "utils/string.h"#include "utils/Vector.h"#include "fetch/site.h"#include "fetch/file.h"#include "fetch/fetchOpen.h"#include "fetch/checker.h"#include "utils/debug.h"#define ANSWER 0#define HEADERS 1#define HEADERS30X 2#define HTML 3#define SPECIFIC 4#define LINK 0#define BASE 1/*********************************** * implementation of file ***********************************/file::file (Connexion *conn) { buffer = conn->buffer; pos = 0; posParse = buffer;}file::~file () {}/*********************************** * implementation of robots ***********************************//** Constructor */robots::robots (NamedSite *server, Connexion *conn) : file(conn) { newPars(); this->server = server; answerCode = false; isRobots = true;}/** Destructor */robots::~robots () { delPars(); // server is not deleted on purpose // it belongs to someone else}/** we get some more chars of this file */int robots::endInput () { return 0;}/** input and parse headers */int robots::inputHeaders (int size) { pos += size; if (!answerCode && pos > 12) { if (buffer[9] == '2') { answerCode = true; } else { errno = err40X; return 1; } } if (pos > maxRobotsSize) { // no more input, forget the end of this file errno = tooBig; return 1; } else { return 0; }}/** parse the robots.txt */void robots::parse (bool isError) { if (answerCode && parseHeaders()) { siteRobots(); buffer[pos] = 0; if (isError) { // The file could be incomplete, delete last token // We could have Disallow / instead of Disallow /blabla for (uint i=pos-1; i>0 && !isspace(buffer[i]); i--) { buffer[i] = ' '; } } parseRobots(); }}/** test http headers * return true if OK, false otherwise */bool robots::parseHeaders () { for(posParse = buffer+9; posParse[3] != 0; posParse++) { if ((posParse[0] == '\n' && (posParse[1] == '\n' || posParse[2] == '\n')) || (posParse[0] == '\r' && (posParse[1] == '\r' || posParse[2] == '\r'))) { return true; } } return false;}/** try to understand the file */void robots::parseRobots () { robotsOK();#ifndef NOSTATS bool goodfile = true;#endif // NOSTATS server->forbidden.recycle(); uint items = 0; // size of server->forbidden // state // 0 : not concerned // 1 : weakly concerned // 2 : strongly concerned int state = 1; char *tok = nextToken(&posParse, ':'); while (tok != NULL) { if (!strcasecmp(tok, "useragent") || !strcasecmp(tok, "user-agent")) { if (state == 2) { // end of strong concern record => the end for us return; } else { state = 0; // what is the new state ? tok = nextToken(&posParse, ':'); while (tok != NULL && strcasecmp(tok, "useragent") && strcasecmp(tok, "user-agent") && strcasecmp(tok, "disallow")) { if (caseContain(tok, global::userAgent)) { state = 2; } else if (state == 0 && !strcmp(tok, "*")) { state = 1; } tok = nextToken(&posParse, ':'); } } if (state) { // delete old forbidden : we've got a better record than older ones server->forbidden.recycle(); items = 0; } else { // forget this record while (tok != NULL && strcasecmp(tok, "useragent") && strcasecmp(tok, "user-agent")) { tok = nextToken(&posParse, ':'); } } } else if (!strcasecmp(tok, "disallow")) { tok = nextToken(&posParse, ':'); while (tok != NULL && strcasecmp(tok, "useragent") && strcasecmp(tok, "user-agent") && strcasecmp(tok, "disallow")) { // add nextToken to forbidden if (items++ < maxRobotsItem) { // make this token a good token if (tok[0] == '*') { // * is not correct, / disallows everything tok[0] = '/'; } else if (tok[0] != '/') { tok--; tok[0] = '/'; } if (fileNormalize(tok)) { server->forbidden.addElement(newString(tok)); } } tok = nextToken(&posParse, ':'); } } else {#ifndef NOSTATS if (goodfile) { robotsOKdec(); goodfile = false; }#endif // NOSTATS tok = nextToken(&posParse, ':'); } }}/************************************* * implementation of html *************************************//////////////////////////////////////////#ifdef SPECIFICSEARCH#include "fetch/specbuf.cc"#define _newSpec() if (state==SPECIFIC) newSpec()#define _destructSpec() if (state==SPECIFIC) destructSpec()#define _endOfInput() if (state==SPECIFIC) return endOfInput()#define _getContent() \ if (state==SPECIFIC) return getContent(); \ else return contentStart#define _getSize() \ if (state==SPECIFIC) return getSize(); \ else return (buffer + pos - contentStart)///////////////////////////////////////#else // not a SPECIFICSEARCHvoid initSpecific () { }#define constrSpec() ((void) 0)#define _newSpec() ((void) 0)#define pipeSpec() 0#define _endOfInput() ((void) 0)#define _destructSpec() ((void) 0)#define _getContent() return contentStart#define _getSize() return (buffer + pos - contentStart)#endif // SPECIFICSEARCH/////////////////////////////////////////#if CGILEVEL >= 1#define notCgiChar(c) (c!='?' && c!='=' && c!='*')#else#define notCgiChar(c) true#endif // CGILEVEL/** Constructor */html::html (url *here, Connexion *conn) : file(conn) { newPars(); this->here = here; base = here->giveBase(); state = ANSWER; isInteresting = false; constrSpec(); pages(); isRobots = false;}/** Destructor */html::~html () { _destructSpec(); delPars(); delete here; delete base;}/* get the content of the page */char *html::getPage () { _getContent();}int html::getLength () { _getSize();}/* manage a new url : verify and send it */void html::manageUrl (url *nouv, bool isRedir) { if (nouv->isValid() && filter1(nouv->getHost(), nouv->getFile()) && (global::externalLinks || isRedir || !strcmp(nouv->getHost(), this->here->getHost()))) { // The extension is not stupid (gz, pdf...)#ifdef LINKS_INFO links.addElement(nouv->giveUrl());#endif // LINKS_INFO if (nouv->initOK(here)) { check(nouv); } else { // this url is forbidden for errno reason (set by initOK) answers(errno); delete nouv; } } else { // The extension is stupid delete nouv; }}/**********************************************//* This part manages command line and headers *//**********************************************//** a string is arriving, treat it only up to the end of headers * return 0 usually, 1 if no more input and set errno accordingly */int html::inputHeaders (int size) { pos += size; buffer[pos] = 0; char *posn; while (posParse < buffer + pos) { switch (state) { case ANSWER: posn = strchr(posParse, '\n'); if (posn != NULL) { posParse = posn; if (parseCmdline ()) { return 1; } area = ++posParse; } else { return 0; } break; case HEADERS: case HEADERS30X: posn = strchr(posParse, '\n'); if (posn != NULL) { posParse = posn; int tmp;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -