📄 site.cc
字号:
// Larbin// Sebastien Ailleret// 08-02-00 -> 15-05-00#include <unistd.h>#include <errno.h>#include <iostream.h>#include <string.h>#include <assert.h>#include <time.h>#include <fcntl.h>#include <sys/socket.h>#include <netinet/in.h>#include <netdb.h>#include <adns.h>#include <arpa/inet.h>#include <ctype.h>#include "types.h"#include "xutils/debug.h"#include "xutils/Site.h"#include "xutils/text.h"#include "xutils/connexion.h"#include "xutils/ConstantSizedFifoPriority.h"///////////////////////////////////////////////////////////// class Interval////////////////////////////////////////////////////////////** Constructor */Interval::Interval (uint size) { this->size = size; pos = 0; pthread_mutex_init (&lock, NULL); pthread_cond_init (&nonFull, NULL);}/** Destructor : never used */Interval::~Interval () { pthread_mutex_destroy (&lock); pthread_cond_destroy (&nonFull);}/** Ask the permission to put an url */void Interval::putOne () { pthread_mutex_lock(&lock); while (pos >= size) { pthread_cond_wait(&nonFull, &lock); } pos++; pthread_mutex_unlock(&lock);}/** How many urls can we put * block until at least one is possible */uint Interval::putAll () { pthread_mutex_lock(&lock); while (pos >= size) { pthread_cond_wait(&nonFull, &lock); } if (pos == size-1) { pos = size; pthread_mutex_unlock(&lock); return 1; } else { // This avoid some unnecessary signals (maybe stupid) assert (pos < size-1); int res = size-1 - pos; pos = size-1; pthread_mutex_unlock(&lock); return res; }}/** Warn an url has been retrieved */void Interval::getOne () { pthread_mutex_lock(&lock); if (pos-- == size) { pthread_cond_broadcast(&nonFull); } pthread_mutex_unlock(&lock);}///////////////////////////////////////////////////////////// class Site////////////////////////////////////////////////////////////** Constructor : initiate fields used by the program */Site::Site () { pthread_mutex_init (&lock, NULL); name = newString(""); forbidden = NULL; addr = NULL; in = 0; out = 0; size = fifoSiteSize; tab = new url *[size]; inFifo = false;}/** Destructor : This one is never used */Site::~Site () { cerr << "Some site is deleted (should not happen\n"; pthread_mutex_destroy (&lock); delete [] name; if (forbidden != NULL) { delete forbidden; } if (addr != NULL) { delete addr; } while (in != out) { delete tab[out]; out = (out+1) % size; } delete [] tab;}/** connect to this server using connection conn * return the state of the socket */char Site::getFds (Connexion *conn) { // use proxy if (global::proxyAddr != NULL) return global::getProxyFds(conn); // no proxy assert (addr != NULL); int fd = socket(AF_INET, SOCK_STREAM, 0); if (fd < 0) return EMPTY; conn->socket = fd; for (;;) { fcntl(fd, F_SETFL, O_NONBLOCK); if (connect(fd, (struct sockaddr*) addr, sizeof (struct sockaddr_in)) == 0) { // success return WRITE; } else if (errno == EINPROGRESS) { // would block return CONNECTING; } else { // error (void) close(fd); return EMPTY; } }}/** Put an url in the fifo * If there are too much, put it back in UrlsInternal (ie on disk) */void Site::putUrl (url *u) { pthread_mutex_lock(&lock); if ((in - out + size) % size > maxUrlsBySite && global::URLsInternal->getLength() > ramUrls/2) { // Already enough Urls in memory for this Site global::URLsInternal->put(u); global::inter->getOne(); } else { // All right, put this url inside at the end of the queue tab[in] = u; in = (in + 1) % size; // Change size if necessary if (in == out) { uint i; url **tmp = new url*[2*size]; for (i=out; i<size; i++) { tmp[i] = tab[i]; } for (i=0; i<in; i++) { tmp[i+size] = tab[i]; } in += size; size *= 2; delete [] tab; tab = tmp; } // Put Site in fifo if not yet in if (!inFifo) { inFifo = true; if (!strcmp(name, tab[out]->getHost()) && port == tab[out]->getPort() && lastUpdate + dnsValidTime >= time(NULL)) { global::okSites->put(this); } else { global::dnsSites->put(this); } } } pthread_mutex_unlock(&lock);}/** Put an prioritarian url in the fifo * Up to now, it's very naive * because we have no memory of priority inside the url */void Site::putPriorityUrl (url *u) { pthread_mutex_lock(&lock); if (in == out) { // first url on this site tab[in] = u; in = (in+1) % size; } else { // store the url in second position (the first might be in use) uint tmp = out; out = (out + size - 1) % size; tab[out] = tab[tmp]; tab[tmp] = u; // Change size if necessary if (in == out) { uint i; url **tmp = new url*[2*size]; for (i=out; i<size; i++) { tmp[i] = tab[i]; } for (i=0; i<in; i++) { tmp[i+size] = tab[i]; } in += size; size *= 2; delete [] tab; tab = tmp; } } // Put Site in fifo if not yet in if (!inFifo) { inFifo = true; if (!strcmp(name, tab[out]->getHost()) && port == tab[out]->getPort() && lastUpdate + dnsValidTime >= time(NULL)) { global::okSites->put(this); } else { global::dnsSites->put(this); } } pthread_mutex_unlock(&lock);}/** Get an url from the fifo * resize tab if too big * the lock must be set when calling this method */url *Site::getUrl () { url *u = tab[out]; out = (out + 1) % size; if (size > fifoSiteSize && ((in - out + size) % size)*3 < size) { // if the tab is too big, reduce it url **tmp = new url*[size/2]; uint i; for (i=0; ((out+i) % size) != in; i++) { tmp[i] = tab[(out+i) % size]; } out = 0; in = i; size /= 2; delete [] tab; tab = tmp; } return u;}/** fetch the first page in the fifo * never perform dns calls */void Site::fetchNonBlock () { pthread_mutex_lock(&lock); if (in == out) { // no more url to read // This is possible because this function can be called recursively // (fetchBlock cannot) inFifo = false; pthread_mutex_unlock(&lock); } else { if (!strcmp(name, tab[out]->getHost()) && port == tab[out]->getPort() && lastUpdate + dnsValidTime >= time(NULL)) { int waitD = lastAccess + global::waitDuration - time(NULL); if (waitD > 0) { // We should wait a little sleep(waitD); } // all right, let's go urls(); url *u = getUrl(); pthread_mutex_unlock(&lock);#ifndef NOSTATS if (global::isSpecific && endWithIgnoreCase(global::privilegedExt, u->getFile())) { extensionTreated(); }#endif global::inter->getOne(); goodSite(u); } else { // That's a new site => don't manage it pthread_mutex_unlock(&lock); global::dnsSites->put(this); } }}/** Init a new dns query */void Site::newQuery (uint *nbCalls) { // Update our stats newId(); if (global::proxyAddr != NULL) { // we use a proxy, no need to get the sockaddr // give anything for going on siteSeen(); siteDNS(); // Get the robots.txt dnsOK(new sockaddr_in); } else if (isdigit(name[0])) { // the name already in numbers-and-dots notation siteSeen(); sockaddr_in *saddr = new sockaddr_in; if (inet_aton(name, &saddr->sin_addr)) { // Yes, it is in numbers-and-dots notation siteDNS(); // compute the new addr saddr->sin_family = AF_INET; saddr->sin_port = htons(port); // Get the robots.txt dnsOK(saddr); } else { // No, it isn't : this site is a non sense delete saddr; dnsErr(); } } else { (*nbCalls)++; adns_query quer = NULL; // adns_query *quer = new adns_query; crash("Submit an adns query"); int res = adns_submit(global::ads, name, (adns_rrtype) adns_r_addr, (adns_queryflags) 0, this, &quer); crash("End of submission"); assert (res == 0); }}/** The dns query ended with success */void Site::dnsAns (adns_answer *ans) { siteSeen(); assert (addr == NULL); if (ans->status != adns_s_ok) { // No addr inet dnsErr(); } else { siteDNS(); // compute the new addr sockaddr_in *saddr = new sockaddr_in; memcpy (saddr, &ans->rrs.addr->addr.inet, sizeof (sockaddr_in)); saddr->sin_family = AF_INET; saddr->sin_port = htons(port); // Get the robots.txt dnsOK(saddr); }}/** we've got a good dns answer * get the robots.txt */void Site::dnsOK (sockaddr_in *saddr) { urls(); addr = saddr; pthread_mutex_lock(&lock); url *u = getUrl(); pthread_mutex_unlock(&lock);#ifndef NOSTATS if (global::isSpecific && endWithIgnoreCase(global::privilegedExt, u->getFile())) { extensionTreated(); }#endif global::inter->getOne(); stateBlock(14); Connexion *conn = global::freeConns->getPriority(); stateBlock(15); char res = getFds(conn); if (res != EMPTY) { conn->timeout = time(NULL) + timeoutPage; if (global::proxyAddr != NULL) { conn->request.addString("GET http://"); conn->request.addString(name); char tmp[15]; sprintf(tmp, ":%u", port); conn->request.addString(tmp); conn->request.addString("/robots.txt HTTP/1.0\r\nHost: "); } else { conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: "); } conn->request.addString(u->getHost()); conn->request.addString(global::headers); conn->parser = new robots(u); conn->pos = 0; // This must be done in last // Be careful, There is no lock !!!! conn->state = res; // keep lock while fetching } else { // Unable to get a socket fetchFail(u, noConnection); answers(noConnection); // stat delete u; global::freeConns->put(conn); fetchNonBlock(); }}/** Cannot get the inet addr */void Site::dnsErr () { pthread_mutex_lock(&lock); url *u = getUrl(); while (!strcmp(name, u->getHost()) && port == u->getPort()) { fetchFail(u, noConnection); urls(); answers(noDNS);#ifndef NOSTATS if (global::isSpecific && endWithIgnoreCase(global::privilegedExt, u->getFile())) { extensionTreated(); }#endif delete u; global::inter->getOne(); if (in == out) { break; } else { u = getUrl(); } } if (in != out) { // put the url in the tab out = (out + size - 1) % size; tab[out] = u; global::dnsSites->put(this); } else { inFifo = false; } pthread_mutex_unlock(&lock);}/** We have an url on the good site : Connect it */void Site::goodSite (url *u) { // That's the good site if (addr == NULL) { // We didn't manage to get the inet addr of this site fetchFail(u, noDNS); answers(noDNS); delete u; fetchNonBlock(); } else { // Connection is possible Connexion *conn = global::freeConns->get(); connectUrl(conn, u); }}/* try to connect to a site * and ask for an file * if you modifie this function, don't forget connectThisUrl */void Site::connectUrl (Connexion *conn, url *u) { if (testRobots(u->getFile())) { // We're allowed to fetch this one // open the socket char res = getFds(conn); if (res != EMPTY) { lastAccess = time(NULL); conn->timeout = lastAccess + timeoutPage; conn->request.addString("GET "); if (global::proxyAddr != NULL) { char *tmp = u->giveUrl(); conn->request.addString(tmp); delete [] tmp; } else { conn->request.addString(u->getFile()); } conn->request.addString(" HTTP/1.0\r\nHost: "); conn->request.addString(u->getHost()); conn->request.addString(global::headers); conn->parser = new html (u); conn->pos = 0; // This must be done in last // Be careful, There is no lock !!!! conn->state = res; // We keep the lock while fetching } else { // Unable to connect fetchFail(u, noConnection); answers(noConnection); delete u; global::freeConns->put(conn); fetchNonBlock(); } } else { // We're not welcome on this site fetchFail(u, forbiddenRobots); answers(forbiddenRobots); delete u; global::freeConns->put(conn); fetchNonBlock(); }}/* try to connect to a site * and ask for an file * do not perform any freeConns.get (might cause deadlock) * if you modifie this function, don't forget connectUrl */void Site::connectThisUrl (Connexion *conn, url *u) { if (testRobots(u->getFile())) { // We're allowed to fetch this one // open the socket char res = getFds(conn); if (res != EMPTY) { lastAccess = time(NULL); conn->timeout = lastAccess + timeoutPage; conn->request.addString("GET "); if (global::proxyAddr != NULL) { char *tmp = u->giveUrl(); conn->request.addString(tmp); delete [] tmp; } else { conn->request.addString(u->getFile()); } conn->request.addString(" HTTP/1.0\r\nHost: "); conn->request.addString(u->getHost()); conn->request.addString(global::headers); conn->parser = new html (u); conn->pos = 0; // This must be done in last // Be careful, There is no lock !!!! conn->state = res; // We keep the lock while fetching } else { // Unable to connect fetchFail(u, noConnection); answers(noConnection); delete u; global::freeConns->put(conn); putInFifo(); } } else { // We're not welcome on this site fetchFail(u, forbiddenRobots); answers(forbiddenRobots); delete u; global::freeConns->put(conn); putInFifo(); }}/** test if a file can be fetched thanks to the robots.txt */bool Site::testRobots(char *file) { if (forbidden == NULL) { return true; } else { int i=0; while ((*forbidden)[i] != NULL) { if (startWith((*forbidden)[i], file)) { return false; } i++; } return true; }}/** Delete the old identity of the site */void Site::newId () { // The lock protects tab, which can be change by putUrl pthread_mutex_lock(&lock); assert (strcmp(name, tab[out]->getHost()) || port != tab[out]->getPort() || lastUpdate + dnsValidTime <= time(NULL)); // Change the identity of this site#ifndef NDEBUG if (name[0] == 0) { addsite(); }#endif // NDEBUG url *u = tab[out]; pthread_mutex_unlock(&lock); delete [] name; name = newString(u->getHost()); port = u->getPort(); lastUpdate = time(NULL); lastAccess = 0; // Delete old forbidden list if (forbidden != NULL) { delete forbidden; forbidden = NULL; } if (addr != NULL) { delete addr; addr = NULL; }}/** After a fetch, decide whether or not the site must be * put in okSites or dnsSites */void Site::putInFifo () { assert (inFifo); pthread_mutex_lock(&lock); if (in == out) { inFifo = false; } else if (!strcmp(name, tab[out]->getHost()) && port == tab[out]->getPort() && lastUpdate + dnsValidTime >= time(NULL)) { global::okSites->put(this); } else { global::dnsSites->put(this); } pthread_mutex_unlock(&lock);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -