http.c

来自「网络爬虫程序」· C语言 代码 · 共 1,770 行 · 第 1/3 页

C
1,770
字号
    snprintf(pom, sizeof(pom), "From: %s\r\n", priv_cfg.from);    req = tl_str_append(req, pom);  }  if(cfg.send_cookies && (p = cookie_get_field(urlp)))  {    req = tl_str_append(req, p);    _free(p);  }  /*** HTTP authorization field ****/  p = http_get_auth_str(docp, method);  if(p)  {    req = tl_str_concat(req, "Authorization: ", p, "\r\n", NULL);    _free(p);  }  /**** information for HTTP proxy authorization ****/  p = http_get_proxy_auth_str(docp, method);  if(p)  {    req = tl_str_concat(req, "Proxy-Authorization: ", p, "\r\n", NULL);    _free(p);  }  /**** prefered language ****/  if(priv_cfg.accept_lang)  {    bool_t f = FALSE;    al = priv_cfg.accept_lang;    if(*al)    {      snprintf(pom, sizeof(pom), "Accept-Language: %s", *al);      al++;      f = TRUE;    }    while(*al)    {      strcat(pom, ","); /* FIXME: Security */      strcat(pom, *al);      al++;    }    if(f)    {      strcat(pom, "\r\n");      req = tl_str_append(req, pom);    }  }  /*** preffered character sets ***/  if(priv_cfg.accept_chars)  {    bool_t f = FALSE;    al = priv_cfg.accept_chars;    if(*al)    {      snprintf(pom, sizeof(pom), "Accept-Charset: %s", *al);      al++;      f = TRUE;    }    while(*al)    {      strcat(pom, ",");      strcat(pom, *al);      al++;    }    if(f)    {      strcat(pom, "\r\n");      req = tl_str_append(req, pom);    }  }  /**** referer URL ****/  if(cfg.referer)  {    if(urlp->parent_url)    {      url *par_url;      LOCK_URL(urlp);      par_url = (url *) urlp->parent_url->data;      UNLOCK_URL(urlp);      if(par_url->type != URLT_FILE)      {        p = url_to_urlstr(par_url, FALSE);        req = tl_str_concat(req, "Referer: ", p, "\r\n", NULL);        _free(p);      }    }    else if(cfg.auto_referer)    {      p = url_to_urlstr(urlp, FALSE);      req = tl_str_concat(req, "Referer: ", p, "\r\n", NULL);      _free(p);    }  }  /**** allow transfer encoding with gzip, compress ****/  if(cfg.use_enc)  {#ifdef HAVE_ZLIB#if 0                           /*MJF: Horrible workaround hack until deflate code works */    req =      tl_str_append(req,      "Accept-Encoding: x-gzip, gzip, x-compress, compress, deflate\r\n");#else    req = tl_str_append(req, "Accept-Encoding: x-gzip, gzip\r\n");#endif#else    req =      tl_str_append(req,      "Accept-Encoding: x-gzip, gzip, x-compress, compress\r\n");#endif  }  /**** reget (not supported by all servers) ****/  if(docp->rest_pos || docp->rest_end_pos > 0)  {    if(docp->rest_end_pos > 0)    {      snprintf(pom, sizeof(pom), "Range: bytes=%ld-%ld\r\n",      (long) docp->rest_pos, (long) docp->rest_end_pos);    }    else    {      snprintf(pom, sizeof(pom), "Range: bytes=%ld-\r\n",      (long) docp->rest_pos);    }    req = tl_str_append(req, pom);    if(cfg.send_if_range && docp->etag)    {      snprintf(pom, sizeof(pom), "If-Range: %s\r\n", docp->etag);      req = tl_str_append(req, pom);    }  }  /*** conditional GET for sync mode ***/  if(docp->origtime && !docp->rest_pos)  {    LOCK_TIME;    strftime(pom, sizeof(pom),      "If-Modified-Since: %a, %d %b %Y %H:%M:%S GMT\r\n",      gmtime(&docp->origtime));    UNLOCK_TIME;    req = tl_str_append(req, pom);  }  /**** no caching ****/  if(!cfg.cache)  {    req =      tl_str_append(req, "Pragma: no-cache\r\nCache-Control: no-cache\r\n");  }  /**** additional headers via -httpadd ****/  if((p = http_get_additional_headers(urlp)))  {    req = tl_str_append(req, p);    _free(p);  }  if(conttype)  {    snprintf(pom, sizeof(pom), "Content-type: %s\r\n", conttype);    req = tl_str_append(req, pom);  }  if(datalen)  {    snprintf(pom, sizeof(pom), "Content-length: %d\r\n", datalen);    req = tl_str_append(req, pom);  }  if(docp->additional_headers)  {    req = tl_str_append(req, docp->additional_headers);  }  req = tl_str_append(req, "\r\n");  DEBUG_PROTOC(gettext    ("************ Client HTTP MIME header ***************\n"));  DEBUG_PROTOC("%s", req);  DEBUG_PROTOC("****************************************************\n");  /**** send request ****/  len = strlen(req);  if((wlen = abs_write(docp->datasock, req, len)) != len)  {    xperror("abs_write");    _free(req);    docp->errcode = ERR_HTTP_SNDREQ;    return -1;  }  _free(req);  if(data)  {    gui_set_status(gettext("Sending data ..."));    DEBUG_PROTOD(gettext("************ HTTP request data ***************\n"));    DEBUG_PROTOD("%s\n", data);    DEBUG_PROTOD("*************************************************\n");    if(abs_write(docp->datasock, data, datalen) != datalen)    {      docp->errcode = ERR_HTTP_SNDREQDATA;      return -1;    }  }  gui_set_status(gettext("Waiting for response ..."));  /*** handling of 1xx response codes ***/  while((wlen = abs_readln(docp->datasock, pom, sizeof(pom) - 1)) > 0)  {    http_response *resp;    if(!timerisset(&docp->first_byte_time))    {      gettimeofday(&docp->first_byte_time, NULL);    }    resp = http_get_response_info(pom);    if(resp)      _free(resp->text);    bufio_unread(docp->datasock, pom, wlen);    if(resp && (resp->ret_code < 200))    {      _free(resp);      if(http_read_mime_header(docp, &p, &len) <= 0)      {        xprintf(1, gettext("Error reading HTTP 1xx class response\n"));        break;      }      else      {        DEBUG_PROTOS(gettext          ("***************** class 1xx HTTP response ****************\n"));        DEBUG_PROTOS("%s", p);        DEBUG_PROTOS          ("***********************************************************\n");      }    }    else    {      _free(resp);      break;    }  }  if(docp->is_http11 && docp->is_persistent && (!wlen || wlen < 0))  {    docp->is_persistent = FALSE;    docp->errcode = ERR_HTTP_CLOSURE;    abs_close_socket(docp, FALSE);    return -1;  }  if(wlen < 0)  {    docp->errcode = ERR_HTTP_RCVRESP;    return -1;  }  return 0;}int http_get_request(doc * docp){  docp->request_type = HTTP_REQ_GET;  return http_request(docp, "GET", 0, 0, 0);}int http_head_request(doc * docp){  docp->request_type = HTTP_REQ_HEAD;  return http_request(docp, "HEAD", 0, 0, 0);}int http_post_request(doc * docp){  form_info *fi = (form_info *) docp->doc_url->extension;  char *data = NULL;  int datalen = 0;  char *type = NULL;  int rv;  docp->request_type = HTTP_REQ_POST;  if(fi->encoding == FORM_E_MULTIPART)  {    fi->text = form_encode_multipart_boundary();    type = tl_str_concat(type, "multipart/form-data; boundary=",      fi->text, NULL);  }  else                          /*if (fi->encoding == FORM_E_URLENCODED) */    type = tl_strdup("application/x-www-form-urlencoded");  data = form_encode_query(fi, &datalen);  if(!data && fi->infos)  {    _free(data);    _free(type);    _free(fi->text);    docp->errcode = ERR_HTTP_BADRQ;    return -1;  }  rv = http_request(docp, "POST", data, datalen, type);  _free(data);  _free(type);  _free(fi->text);  return rv;}static bufio *http_open_socket(doc * docp){  char *host = NULL;  int port = 0;#define PENAULT_VAL     10  docp->errcode = ERR_NOERROR;  if(docp->http_proxy)  {    host = docp->http_proxy;    port = docp->http_proxy_port;  }  else  {    host = url_get_site(docp->doc_url);    port = url_get_port(docp->doc_url);  }  if(!docp->datasock)  {    gui_set_status(gettext("Connecting ..."));    docp->datasock = bufio_sock_fdopen(net_connect(host, port, docp));#ifdef USE_SSL    if(docp->datasock && docp->doc_url->type == URLT_HTTPS)    {      bufio *ssl_sock;      ssl_sock = my_ssl_do_connect(docp, docp->datasock, NULL);      if(!ssl_sock)      {        if(!docp->errcode)          docp->errcode = ERR_HTTPS_CONNECT;        bufio_close(docp->datasock);        docp->datasock = NULL;        return NULL;      }      else      {        docp->datasock = ssl_sock;      }    }#endif  }  if(!docp->datasock)  {    if(_h_errno_ != 0)      xherror(host);    else      xperror("net_connect");    if(docp->http_proxy)    {      docp->errcode = ERR_HTTP_PROXY_CONN;      /*** for penaulting failed HTTP proxy servers ***/      if(docp->doc_url->type == URLT_HTTP)      {        http_proxy *pr;        LOCK_PROXY;        pr = http_proxy_find(docp->http_proxy, docp->http_proxy_port);        if(pr)        {          pr->penault = PENAULT_VAL + pr->fails;          pr->fails++;        }        UNLOCK_PROXY;      }      _free(docp->http_proxy);    }    else      docp->errcode = ERR_HTTP_CONNECT;    return NULL;  }  return docp->datasock;}void http_handle_redirect(doc * docu, int redir_code){  url *pomurl;  char *pomcr = NULL;  pomcr = get_mime_param_val_str("Location:", docu->mime);  if(!pomcr)  {    pomcr = get_mime_param_val_str("URI:", docu->mime);    if(pomcr)    {      char *p;      p = strchr(pomcr, ';');      if(p)        *p = '\0';      p = strchr(pomcr, '>');      if(p)        *p = '\0';      if(pomcr[0] == '<')      {        p = pomcr;        pomcr = tl_strdup(pomcr + 1);        _free(p);      }    }    else    {      docu->errcode = ERR_HTTP_BADREDIRECT;    }  }  if(pomcr)  {    pomurl = url_parse(pomcr);    assert(pomurl->type != URLT_FROMPARENT);    if(pomurl->type == URLT_FILE)    {      char *base;      char *baset;      char *xp;      baset = url_to_urlstr(docu->doc_url, FALSE);      if((xp = strrchr(baset, '#')))        *xp = '\0';      if((xp = strrchr(baset, '?')))        *xp = '\0';      base = url_to_urlstr(docu->doc_url, FALSE);      if(!tl_is_dirname(base))      {        xp = strrchr(base, '/');        if(xp)          *(xp + 1) = '\0';      }      xp = url_to_absolute_url(base, baset, docu->doc_url, pomcr);      free_deep_url(pomurl);      _free(pomurl);      pomurl = url_parse(xp);      _free(xp);      _free(base);      _free(baset);    }    _free(pomcr);    if(pomurl && prottable[pomurl->type].supported)    {      if(docu->is_robot)      {        docu->doc_url->moved_to = pomurl;        docu->errcode = ERR_HTTP_REDIR;      }      else      {        if(url_redirect_to(docu->doc_url, pomurl, (redir_code == 303)))          docu->errcode = ERR_HTTP_CYCLIC;        else          docu->errcode = ERR_HTTP_REDIR;      }    }    else    {      if(pomurl)      {        free_deep_url(pomurl);        _free(pomurl);        docu->errcode = ERR_HTTP_UNSUPREDIR;      }      else        docu->errcode = ERR_HTTP_BADREDIRECT;    }  }  else    docu->errcode = ERR_HTTP_BADREDIRECT;}static void http_process_response_11flags(doc * docu, http_response * resp){  char *p;  if(cfg.use_http11 && !docu->http_proxy_10 &&    resp->ver_maj == 1 && resp->ver_min == 1)  {    docu->is_http11 = TRUE;    docu->is_persistent = TRUE;    docu->is_chunked = FALSE;    docu->read_trailer = FALSE;  }  else  {    docu->is_http11 = FALSE;    docu->is_persistent = FALSE;    docu->is_chunked = FALSE;    docu->read_trailer = FALSE;  }  p = get_mime_param_val_str("Content-Length:", docu->mime);  if(p)  {    docu->totsz = _atoi(p);    if(errno == ERANGE)      docu->totsz = -1;    _free(p);  }  p = get_mime_param_val_str("Transfer-Encoding:", docu->mime);  if(p)  {    if(!strcasecmp(p, "chunked") || !strncasecmp(p, "chunked;", 8))    {      docu->is_chunked = TRUE;      docu->read_chunksize = TRUE;      docu->read_trailer = FALSE;    }    _free(p);  }  p = get_mime_param_val_str("Connection:", docu->mime);  if(p)  {    if(!strcasecmp(p, "close"))      docu->is_persistent = FALSE;    _free(p);  }  if(docu->http_proxy)  {    p = get_mime_param_val_str("Proxy-Connection:", docu->mime);    if(p)    {      if(!strcasecmp(p, "close"))        docu->is_persistent = FALSE;      else if(!strcasecmp(p, "keep-alive"))        docu->is_persistent = TRUE;      _free(p);    }  }}/* * -1 - failure before sending auth data - http_process_response() can *      continue safely *  0 - OK - http_process_response() must return immediately *  1 - failure after sending auth data - http_process_response() must *      return immediately */static int http_do_proxy_redirect(doc * docp){  char *loc;  int port;  char proxy[256];  int rv = 0;  loc = get_mime_param_val_str("Location:", docp->mime);  if(!loc)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?