http.c

来自「网络爬虫程序」· C语言 代码 · 共 1,770 行 · 第 1/3 页

C
1,770
字号
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <netdb.h>
#include <errno.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>

#include "url.h"
#include "form.h"
#include "http.h"
#include "net.h"
#include "base64.h"
#include "tools.h"
#include "cookie.h"
#include "mime.h"
#include "errcode.h"
#include "abstract.h"
#include "myssl.h"
#include "authinfo.h"
#include "gui_api.h"
#include "times.h"
#include "uexit.h"
#include "cookie.h"
#include "ntlm_auth.h"
#include "digest_auth.h"
#include "doc.h"

static void http_process_response_11flags(doc *, http_response *);

/*
 * Table mapping authentication scheme names to their identifiers.
 * The list is terminated by an entry whose name is NULL.
 */
const http_auth_type_info_t http_auths[] = {
  {"", HTTP_AUTH_NONE},
  {"user", HTTP_AUTH_USER},
  {"Basic", HTTP_AUTH_BASIC},
  {"Digest", HTTP_AUTH_DIGEST},
  {"NTLM", HTTP_AUTH_NTLM},
  {NULL, HTTP_AUTH_NONE},
};

/*
 * Parse a header definition of the form "[+]Name:[ ]value".
 *
 * A leading '+' sets the "all" flag of the resulting header.  The name
 * is duplicated with a length of (colon position + 1), i.e. presumably
 * including the ':' itself - http_get_additional_headers() relies on
 * that when it emits "name value".  One single space directly after
 * the colon is skipped before the value is duplicated.
 *
 * Returns a newly allocated httphdr (release it with httphdr_free()),
 * or NULL when str is NULL or contains no ':'.
 */
httphdr *httphdr_parse(char *str)
{
  char *colon;
  httphdr *rv;
  int all;

  if(!str)
    return NULL;

  all = (*str == '+');
  if(all)
    str++;

  /* validate first, so nothing has to be released on malformed input */
  colon = strchr(str, ':');
  if(!colon)
    return NULL;

  rv = _malloc(sizeof(*rv));
  if(!rv)
    return NULL;

  rv->all = all ? TRUE : FALSE;
  rv->name = tl_strndup(str, colon - str + 1);

  colon++;
  rv->val = (*colon == ' ') ? tl_strdup(colon + 1) : tl_strdup(colon);

  return rv;
}

/*
 * Release a header created by httphdr_parse().  NULL is accepted and
 * ignored.
 */
void httphdr_free(httphdr * hdr)
{
  if(!hdr)
    return;

  _free(hdr->val);
  _free(hdr->name);
  _free(hdr);
}

/*
 * Build the block of user configured extra headers
 * (priv_cfg.http_headers) that applies to urlp.
 *
 * Every header is used for URLs without a parent URL; for URLs with
 * a parent only headers carrying the "all" flag are used.  Each header
 * is emitted as "<name> <value>\r\n".
 *
 * The text is assembled dynamically, so a long header can not be cut
 * short and lose its terminating CRLF.
 *
 * Returns an allocated string owned by the caller, or NULL when no
 * header applies.
 */
static char *http_get_additional_headers(url * urlp)
{
  char *req = NULL;
  dllist *ptr;

  for(ptr = priv_cfg.http_headers; ptr; ptr = ptr->next)
  {
    httphdr *hdr = (httphdr *) ptr->data;

    if(!hdr || !hdr->name)
      continue;

    if(!urlp->parent_url || hdr->all)
    {
      /* a NULL value would terminate the argument list too early */
      req = tl_str_concat(req, hdr->name, " ",
        hdr->val ? hdr->val : "", "\r\n", NULL);
    }
  }

  return req;
}

/*
 * Format the credentials for the given scheme into auth (authsz bytes).
 *
 * Digest is only used when digest info is available; the "user" and
 * "Basic" schemes are formatted locally.  For any other combination
 * auth is left untouched.  Shared by the site and the proxy variant.
 */
static void http_format_credentials(int scheme, http_digest_info * digest,
  char *method, char *user, char *pass, url * urlp, char *auth,
  size_t authsz)
{
  if(scheme == HTTP_AUTH_DIGEST && digest)
  {
    http_get_digest_auth_str(digest, method, user, pass, urlp, auth, authsz);
  }
  else if(scheme == HTTP_AUTH_USER)
  {
    snprintf(auth, authsz, "user %s:%s", user, pass);
  }
  else if(scheme == HTTP_AUTH_BASIC)
  {
    char pom[1024];
    char *enc;

    snprintf(pom, sizeof(pom), "%s:%s", user, pass);
    enc = base64_encode(pom);
    if(enc)
    {
      snprintf(auth, authsz, "Basic %s", enc);
      free(enc);
    }
  }
}

/*
 * Compute the authorization value for the document's URL.
 *
 * Returns NULL for FTP URLs and when no credentials are known.
 * With user and password the value depends on the authorization
 * scheme of the URL; with a user name only, the bare user name
 * is returned.
 *
 * The result is an allocated string owned by the caller.
 */
static char *http_get_auth_str(doc * docp, char *method)
{
  char auth[2048];
  char *user, *pass;
  char *realm;
  int auth_scheme;
  http_digest_info *auth_digest;

  auth[0] = '\0';

  if(docp->doc_url->type == URLT_FTP)
    return NULL;

  auth_digest = (http_digest_info *) docp->auth_digest;
  realm = auth_digest ? auth_digest->realm : NULL;

  user = url_get_user(docp->doc_url, realm);
  pass = url_get_pass(docp->doc_url, realm);
  auth_scheme = url_get_auth_scheme(docp->doc_url, realm);

  if(user)
  {
    if(pass)
    {
      http_format_credentials(auth_scheme, auth_digest, method, user, pass,
        docp->doc_url, auth, sizeof(auth));
    }
    else
    {
      /* bounded copy, always terminated */
      snprintf(auth, sizeof(auth), "%s", user);
    }
  }

  return auth[0] ? tl_strdup(auth) : NULL;
}

/*
 * Compute the proxy authorization value for the document.
 *
 * Credentials default to the configured proxy user, password and
 * scheme; when the document has a proxy port set and
 * authinfo_match_entry() finds an entry for the proxy, that entry
 * overrides them.  With a user name only, "user <name>" is produced.
 *
 * Returns an allocated string owned by the caller, or NULL when
 * nothing was produced.
 */
static char *http_get_proxy_auth_str(doc * docp, char *method)
{
  char auth[2048];
  char *proxy_user, *proxy_pass;
  int proxy_auth_scheme;
  http_digest_info *auth_proxy_digest;

  auth_proxy_digest = (http_digest_info *) docp->auth_proxy_digest;
  auth[0] = '\0';

  proxy_user = priv_cfg.http_proxy_user;
  proxy_pass = priv_cfg.http_proxy_pass;
  proxy_auth_scheme = cfg.proxy_auth_scheme;

  if(docp->http_proxy_port)
  {
    authinfo *ai;

    ai = authinfo_match_entry(docp->doc_url->type,
      docp->http_proxy, docp->http_proxy_port, NULL, NULL);

    if(ai)
    {
      proxy_user = ai->user;
      proxy_pass = ai->pass;
      proxy_auth_scheme = ai->type;
    }
  }

  if(proxy_user)
  {
    if(proxy_pass)
    {
      http_format_credentials(proxy_auth_scheme, auth_proxy_digest, method,
        proxy_user, proxy_pass, docp->doc_url, auth, sizeof(auth));
    }
    else
    {
      snprintf(auth, sizeof(auth), "user %s", proxy_user);
    }
  }

  return auth[0] ? tl_strdup(auth) : NULL;
}

/*
 * Determine the authentication scheme named at the start of an
 * authenticate header value.
 *
 * Leading whitespace is skipped and the first token (up to a space or
 * tab) is compared case-insensitively against http_auths[].  The token
 * has to match a scheme name in full; a mere prefix such as "Dig" is
 * not accepted as "Digest".
 *
 * Returns HTTP_AUTH_NONE for NULL, empty or unknown input.
 */
http_auth_type_t http_get_authorization_type(char *auth_field)
{
  char *p = auth_field;
  size_t l;
  int i;

  if(!p)
    return HTTP_AUTH_NONE;

  while(tl_ascii_isspace(*p))
    p++;

  l = strcspn(p, " \t");

  for(i = 0; http_auths[i].name; i++)
  {
    if(strlen(http_auths[i].name) == l &&
      !strncasecmp(p, http_auths[i].name, l))
      return http_auths[i].id;
  }

  return HTTP_AUTH_NONE;
}

/*
 * Walk all occurrences of the MIME header `field` in docu->mime and
 * return the value of the last one whose scheme equals `scheme`.
 *
 * Returns an allocated string the caller has to free(), or NULL when
 * no occurrence uses that scheme.
 */
static char *http_pick_auth_header(doc * docu, char *field,
  http_auth_type_t scheme)
{
  char *match = NULL;
  char *authtag;
  int i;

  for(i = 0;
    (authtag = get_mime_n_param_val_str(field, docu->mime, i)) != NULL; i++)
  {
    if(http_get_authorization_type(authtag) == scheme)
    {
      /* a later occurrence replaces an earlier one */
      free(match);
      match = authtag;
    }
    else
    {
      free(authtag);
    }
  }

  return match;
}

/*
 * React to the WWW-Authenticate: headers of the response stored in
 * docu->mime.  Digest is preferred; NTLM is tried only when no Digest
 * challenge is present and ENABLE_NTLM is defined.
 *
 * Returns the result of the selected authentication handler, or -1
 * when no supported challenge was found.
 */
int http_handle_site_auth_info(doc * docu)
{
  int rv = -1;
  char *authtag;

  authtag = http_pick_auth_header(docu, "WWW-Authenticate:",
    HTTP_AUTH_DIGEST);
  if(authtag)
  {
    rv = http_digest_do_auth(docu, authtag);
    free(authtag);
    return rv;
  }

#ifdef ENABLE_NTLM
  authtag = http_pick_auth_header(docu, "WWW-Authenticate:", HTTP_AUTH_NTLM);
  if(authtag)
  {
    rv = ntlm_negotiate_connection(docu, authtag);
    free(authtag);
  }
#endif

  return rv;
}

/*
 * React to the Proxy-Authenticate: headers of the response stored in
 * docu->mime.  Digest is preferred; NTLM is tried only when no Digest
 * challenge is present and ENABLE_NTLM is defined.
 *
 * Returns the result of the selected authentication handler, or -1
 * when no supported challenge was found.
 */
int http_handle_proxy_auth_info(doc * docu)
{
  int rv = -1;
  char *authtag;

  authtag = http_pick_auth_header(docu, "Proxy-Authenticate:",
    HTTP_AUTH_DIGEST);
  if(authtag)
  {
    rv = http_digest_do_proxy_auth(docu, authtag);
    free(authtag);
    return rv;
  }

#ifdef ENABLE_NTLM
  authtag = http_pick_auth_header(docu, "Proxy-Authenticate:",
    HTTP_AUTH_NTLM);
  if(authtag)
  {
    rv = ntlm_negotiate_proxy_connection(docu, authtag);
    free(authtag);
  }
#endif

  return rv;
}

/*
 * Compose a CONNECT request for host:port and write it to
 * docp->datasock.
 *
 * The request carries a Host: header when HTTP/1.1 is enabled, the
 * proxy authorization (if any), docp->additional_headers and the user
 * configured additional headers.
 *
 * Returns 0 on success, -1 when the request could not be written
 * completely.
 */
static int http_dumy_proxy_send_connect(doc * docp, char *host, int port)
{
  char pom[1024];
  char *req, *p;
  size_t reqlen;

  if(cfg.use_http11)
    snprintf(pom, sizeof(pom), "CONNECT %s:%d HTTP/1.1\r\nHost: %s:%d\r\n",
      host, port, host, port);
  else
    snprintf(pom, sizeof(pom), "CONNECT %s:%d HTTP/1.0\r\n", host, port);

  req = tl_strdup(pom);

  /**** information for HTTP proxy authorization ****/
  p = http_get_proxy_auth_str(docp, "CONNECT");
  if(p)
  {
    req = tl_str_concat(req, "Proxy-Authorization: ", p, "\r\n", NULL);
    _free(p);
  }

  if(docp->additional_headers)
    req = tl_str_concat(req, docp->additional_headers, NULL);

  /**** additional headers via -httpadd ****/
  if((p = http_get_additional_headers(docp->doc_url)))
  {
    req = tl_str_append(req, p);
    _free(p);
  }

  /* empty line terminates the header block */
  req = tl_str_concat(req, "\r\n", NULL);

  DEBUG_PROTOC(gettext
    ("****************** Proxy connect request *****************\n"));
  DEBUG_PROTOC("%s", req);
  DEBUG_PROTOC
    ("**********************************************************\n");

  reqlen = strlen(req);
  if(abs_write(docp->datasock, req, reqlen) != reqlen)
  {
    xperror("Proxy connect request");
    _free(req);
    return -1;
  }

  _free(req);

  return 0;
}

/*
 * Send a CONNECT request for host:port through the proxy and evaluate
 * the proxy's response.
 *
 * When the document has no proxy set and proxy_host is given, a copy
 * of proxy_host is installed in docp for the duration of the call; it
 * is released again before returning (see the review note on the 407
 * path below for the one exception).
 *
 * Returns 0 on success and -1 on failure.
 */
int http_dumy_proxy_connect_real(doc * docp, char *host, int port,
  char *proxy_host, int proxy_port)
{
  char *p = NULL;
  int len;
  bool_t rem_proxy = FALSE;
  int rv = 0;

  docp->request_type = HTTP_REQ_CONNECT;
  docp->connect_host = host;
  docp->connect_port = port;

  if(!docp->http_proxy && proxy_host)
  {
    rem_proxy = TRUE;
    docp->http_proxy = tl_strdup(proxy_host);
    docp->http_proxy_port = proxy_port;
  }

  if(http_dumy_proxy_send_connect(docp, host, port))
  {
    rv = -1;
  }
  else if(http_read_mime_header(docp, &p, &len) > 0)
  {
    http_response *resp;

    DEBUG_PROTOS(gettext
      ("***************** Proxy connect response *****************\n"));
    DEBUG_PROTOS("%s", p);
    DEBUG_PROTOS
      ("**********************************************************\n");

    docp->mime = p;

    resp = http_get_response_info(p);

    http_process_response_11flags(docp, resp);

    /*** proxy authorization required ***/
    if(resp->ret_code == 407)
    {
      int nauth = docp->num_proxy_auth;

      rv = http_handle_proxy_auth_info(docp);
      if(rv >= 0)
      {
        /*
         * NOTE(review): this early return is kept exactly as it was,
         * including that a temporarily installed proxy name stays in
         * docp.  It is not visible here whether the authentication
         * handlers rely on that - confirm before unifying it with
         * the cleanup at the end of the function.
         */
        _free(docp->mime);
        http_response_free(resp);
        return rv ? -1 : 0;
      }
      else if(nauth)
      {
        rv = 0;
      }
    }
    else if(resp->ret_code >= 400)
      rv = -1;

    http_response_free(resp);
  }
  else
    rv = -1;

  if(rem_proxy)
  {
    /* release our private copy instead of just dropping the pointer */
    free(docp->http_proxy);
    docp->http_proxy = NULL;
  }

  return rv;
}

/*
 * Wrapper around http_dumy_proxy_connect_real() which saves a copy of
 * the document beforehand and puts the size, range, HTTP/1.1, chunking
 * and persistence related fields back afterwards.  On success the
 * document's error code is reset to ERR_NOERROR.
 *
 * Returns the result of http_dumy_proxy_connect_real().
 */
int http_dumy_proxy_connect(doc * docp, char *host, int port,
  char *proxy_host, int proxy_port)
{
  int rv;
  doc docs;

  /* quick save of values */
  memcpy(&docs, docp, sizeof(doc));

  rv = http_dumy_proxy_connect_real(docp, host, port, proxy_host, proxy_port);

  /* restore some of affected values */
  docp->is_parsable = docs.is_parsable;
  docp->size = docs.size;
  docp->totsz = docs.totsz;
  docp->origsize = docs.origsize;
  docp->rest_pos = docs.rest_pos;
  docp->rest_end_pos = docs.rest_end_pos;
  docp->is_http11 = docs.is_http11;
  docp->chunk_size = docs.chunk_size;
  docp->is_chunked = docs.is_chunked;
  docp->read_chunksize = docs.read_chunksize;
  docp->read_trailer = docs.read_trailer;
  docp->is_persistent = docs.is_persistent;
  docp->current_size = docs.current_size;
  docp->adj_sz = docs.adj_sz;

  /* reset errcode */
  if(!rv)
    docp->errcode = ERR_NOERROR;

  return rv;
}

/************************************************/
/* create and send whole HTTP request with      */
/* respective document body (POST req)          */
/************************************************/
static int http_request(doc * docp, char *method, char *data, int datalen,
  char *conttype)
{
  char *req = NULL;
  char pom[2048];
  char *p;
  char **al;
  int len, wlen;
  url *urlp = docp->doc_url;

  gui_set_status(gettext("Sending request ..."));

  if((!cfg.http_proxy && urlp->type == URLT_HTTP) || urlp->type == URLT_HTTPS)
  {
    p = url_to_request_urlstr(urlp, FALSE);
  }
  else
  {
    /* authorization of nonanonymous FTP access */
    /* via HTTP gateways is done different way  */
    /* use ftp://user:password@server/... in    */
    /* request instead of Authorization: ...    */
    if(urlp->type == URLT_FTP)
    {
      char *user;
      char *pass;

      user = url_get_user(urlp, NULL);
      pass = url_get_pass(urlp, NULL);

      if(user && !urlp->p.ftp.user)
        urlp->p.ftp.user = user;
      else
        user = NULL;

      if(pass && !urlp->p.ftp.password)
        urlp->p.ftp.password = pass;
      else
        pass = NULL;

      p = url_to_request_urlstr(urlp, TRUE);

      if(user)
        urlp->p.ftp.user = NULL;

      if(pass)
        urlp->p.ftp.password = NULL;
    }
    else
      p = url_to_request_urlstr(urlp, TRUE);
  }

  if(cfg.use_http11)
    req = tl_str_concat(req, method, " ", p, " HTTP/1.1\r\n", NULL);
  else
    req = tl_str_concat(req, method, " ", p, " HTTP/1.0\r\n", NULL);

  _free(p);

  if(priv_cfg.identity)
  {
    snprintf(pom, sizeof(pom), "User-Agent: %s\r\n", priv_cfg.identity);
  }
  else
  {
    snprintf(pom, sizeof(pom), "User-Agent: %s/%s %s\r\n", PACKAGE, VERSION, HOSTTYPE);
  }
  req = tl_str_append(req, pom);

  if(url_get_port(urlp) != prottable[urlp->type].default_port)
    snprintf(pom, sizeof(pom), "Host: %s:%d\r\n", url_get_site(urlp), url_get_port(urlp));
  else
    snprintf(pom, sizeof(pom), "Host: %s\r\n", url_get_site(urlp));
  req = tl_str_append(req, pom);

  if(cfg.send_from && priv_cfg.from)
  {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?