📄 url.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
{  CURLcode result;  Curl_ipconnect *addr;  /*************************************************************   * Connect to server/proxy   *************************************************************/  result= Curl_connecthost(conn,                           hostaddr,                           conn->port,                           &conn->firstsocket,                           &addr,                           connected);  if(CURLE_OK == result) {    /* All is cool, then we store the current information from the hostaddr       struct to the serv_addr, as it might be needed later. The address       returned from the function above is crucial here. */    conn->connect_addr = hostaddr;#ifdef ENABLE_IPV6    conn->serv_addr = addr;#else    memset((char *) &conn->serv_addr, '\0', sizeof(conn->serv_addr));    memcpy((char *)&(conn->serv_addr.sin_addr),           (struct in_addr *)addr, sizeof(struct in_addr));    conn->serv_addr.sin_family = hostaddr->addr->h_addrtype;    conn->serv_addr.sin_port = htons((unsigned short)conn->port);#endif    if (conn->data->set.proxytype == CURLPROXY_SOCKS5) {      return handleSock5Proxy(conn->proxyuser,                              conn->proxypasswd,                              conn,                              conn->firstsocket) ?        CURLE_COULDNT_CONNECT : CURLE_OK;    }    else if (conn->data->set.proxytype == CURLPROXY_HTTP) {      /* do nothing here. handled later. */    }    else {      failf(conn->data, "unknown proxytype option given");      return CURLE_COULDNT_CONNECT;    }  }  return result;}/* * ALERT! The 'dns' pointer being passed in here might be NULL at times. 
*/static void verboseconnect(struct connectdata *conn,                           struct Curl_dns_entry *dns){#ifdef HAVE_INET_NTOA_R  char ntoa_buf[64];#endif  struct SessionHandle *data = conn->data;  /* Figure out the ip-number and display the first host name it shows: */#ifdef ENABLE_IPV6  (void)dns; /* not used in the IPv6 enabled version */  {    char hbuf[NI_MAXHOST];#ifdef NI_WITHSCOPEID    const int niflags = NI_NUMERICHOST | NI_WITHSCOPEID;#else    const int niflags = NI_NUMERICHOST;#endif    struct addrinfo *ai = conn->serv_addr;    if (getnameinfo(ai->ai_addr, ai->ai_addrlen, hbuf, sizeof(hbuf), NULL, 0,	niflags)) {      snprintf(hbuf, sizeof(hbuf), "?");    }    if (ai->ai_canonname) {      infof(data, "Connected to %s (%s) port %d\n", ai->ai_canonname, hbuf,            conn->port);    } else {      infof(data, "Connected to %s port %d\n", hbuf, conn->port);    }  }#else  {    Curl_addrinfo *hostaddr=dns?dns->addr:NULL;    struct in_addr in;    (void) memcpy(&in.s_addr, &conn->serv_addr.sin_addr, sizeof (in.s_addr));    infof(data, "Connected to %s (%s) port %d\n",          hostaddr?hostaddr->h_name:"",#if defined(HAVE_INET_NTOA_R)          inet_ntoa_r(in, ntoa_buf, sizeof(ntoa_buf)),#else          inet_ntoa(in),#endif          conn->port);  }#endif}/* * We have discovered that the TCP connection has been successful, we can now * proceed with some action. * * If we're using the multi interface, this host address pointer is most * likely NULL at this point as we can't keep the resolved info around. This * may call for some reworking, like a reference counter in the struct or * something. The hostaddr is not used for very much though, we have the * 'serv_addr' field in the connectdata struct for most of it. 
*/CURLcode Curl_protocol_connect(struct connectdata *conn,                               struct Curl_dns_entry *hostaddr){  struct SessionHandle *data = conn->data;  CURLcode result=CURLE_OK;  if(conn->bits.tcpconnect)    /* We already are connected, get back. This may happen when the connect       worked fine in the first call, like when we connect to a local server       or proxy. */    return CURLE_OK;  Curl_pgrsTime(data, TIMER_CONNECT); /* connect done */  if(data->set.verbose)    verboseconnect(conn, hostaddr);  if(conn->curl_connect) {    /* is there a protocol-specific connect() procedure? */    /* set start time here for timeout purposes in the     * connect procedure, it is later set again for the     * progress meter purpose */    conn->now = Curl_tvnow();    /* Call the protocol-specific connect function */    result = conn->curl_connect(conn);  }  return result; /* pass back status */}/* * CreateConnection() sets up a new connectdata struct, or re-uses an already * existing one, and resolves host name. * * if this function returns CURLE_OK and *async is set to TRUE, the resolve * response will be coming asynchronously. If *async is FALSE, the name is * already resolved. */static CURLcode CreateConnection(struct SessionHandle *data,                                 struct connectdata **in_connect,                                 struct Curl_dns_entry **addr,                                 bool *async){  char *tmp;  CURLcode result=CURLE_OK;  char resumerange[40]="";  struct connectdata *conn;  struct connectdata *conn_temp;  int urllen;  struct Curl_dns_entry *hostaddr;#ifdef HAVE_ALARM  unsigned int prev_alarm=0;#endif  char endbracket;  char user[MAX_CURL_USER_LENGTH];  char passwd[MAX_CURL_PASSWORD_LENGTH];  int rc;#ifdef HAVE_SIGACTION  struct sigaction keep_sigact;   /* store the old struct here */  bool keep_copysig=FALSE;        /* did copy it? 
*/#else#ifdef HAVE_SIGNAL  void *keep_sigact;              /* store the old handler here */#endif#endif  *addr = NULL; /* nothing yet */  *async = FALSE;    /*************************************************************   * Check input data   *************************************************************/  if(!data->change.url)    return CURLE_URL_MALFORMAT;  /* First, split up the current URL in parts so that we can use the     parts for checking against the already present connections. In order     to not have to modify everything at once, we allocate a temporary     connection data struct and fill in for comparison purposes. */  conn = (struct connectdata *)malloc(sizeof(struct connectdata));  if(!conn) {    *in_connect = NULL; /* clear the pointer */    return CURLE_OUT_OF_MEMORY;  }  /* We must set the return variable as soon as possible, so that our     parent can cleanup any possible allocs we may have done before     any failure */  *in_connect = conn;  /* we have to init the struct */  memset(conn, 0, sizeof(struct connectdata));  /* and we setup a few fields in case we end up actually using this struct */  conn->data = data;           /* remember our daddy */  conn->firstsocket = -1;     /* no file descriptor */  conn->secondarysocket = -1; /* no file descriptor */  conn->connectindex = -1;    /* no index */  conn->bits.httpproxy = (data->change.proxy && *data->change.proxy &&                          (data->set.proxytype == CURLPROXY_HTTP))?    TRUE:FALSE; /* http proxy or not */  /* Default protocol-independent behavior doesn't support persistant     connections, so we set this to force-close. Protocols that support     this need to set this to FALSE in their "curl_do" functions. */  conn->bits.close = TRUE;  /* maxdownload must be -1 on init, as 0 is a valid value! */  conn->maxdownload = -1;  /* might have been used previously! 
*/  /* Store creation time to help future close decision making */  conn->created = Curl_tvnow();  conn->bits.use_range = data->set.set_range?TRUE:FALSE; /* range status */  conn->range = data->set.set_range;              /* clone the range setting */  conn->resume_from = data->set.set_resume_from;   /* inherite resume_from */  /* Set the start time temporary to this creation time to allow easier     timeout checks before the transfer has started for real. The start time     is later set "for real" using Curl_pgrsStartNow(). */  conn->data->progress.start = conn->created;  conn->bits.user_passwd = data->set.userpwd?1:0;  conn->bits.proxy_user_passwd = data->set.proxyuserpwd?1:0;  /* This initing continues below, see the comment "Continue connectdata   * initialization here" */  /***********************************************************   * We need to allocate memory to store the path in. We get the size of the   * full URL to be sure, and we need to make it at least 256 bytes since   * other parts of the code will rely on this fact   ***********************************************************/#define LEAST_PATH_ALLOC 256  urllen=strlen(data->change.url);  if(urllen < LEAST_PATH_ALLOC)    urllen=LEAST_PATH_ALLOC;  conn->path=(char *)malloc(urllen);  if(NULL == conn->path)    return CURLE_OUT_OF_MEMORY; /* really bad error */  /*************************************************************   * Parse the URL.   *   * We need to parse the url even when using the proxy, because we will need   * the hostname and port in case we are trying to SSL connect through the   * proxy -- and we don't know if we will need to use SSL until we parse the   * url ...   ************************************************************/  if((2 == sscanf(data->change.url, "%64[^:]:%[^\n]",                  conn->protostr,                  conn->path)) && strequal(conn->protostr, "file")) {    if(conn->path[0] == '/' && conn->path[1] == '/') {      /* Allow omitted hostname (e.g. 
file:/<path>).  This is not strictly       * speaking a valid file: URL by RFC 1738, but treating file:/<path> as       * file://localhost/<path> is similar to how other schemes treat missing       * hostnames.  See RFC 1808. */      /* This cannot be done with strcpy() in a portable manner, since the         memory areas overlap! */      memmove(conn->path, conn->path + 2, strlen(conn->path + 2)+1);    }    /*     * we deal with file://<host>/<path> differently since it supports no     * hostname other than "localhost" and "127.0.0.1", which is unique among     * the URL protocols specified in RFC 1738     */    if(conn->path[0] != '/') {      /* the URL included a host name, we ignore host names in file:// URLs         as the standards don't define what to do with them */      char *ptr=strchr(conn->path, '/');      if(ptr) {        /* there was a slash present           RFC1738 (section 3.1, page 5) says:           The rest of the locator consists of data specific to the scheme,           and is known as the "url-path". It supplies the details of how the           specified resource can be accessed. Note that the "/" between the           host (or port) and the url-path is NOT part of the url-path.           As most agents use file://localhost/foo to get '/foo' although the           slash preceeding foo is a separator and not a slash for the path,           a URL as file://localhost//foo must be valid as well, to refer to           the same file with an absolute path.        */        if(ptr[1] && ('/' == ptr[1]))          /* if there was two slashes, we skip the first one as that is then             used truly as a separator */          ptr++;        /* This cannot be made with strcpy, as the memory chunks overlap! 
*/        memmove(conn->path, ptr, strlen(ptr)+1);      }    }    strcpy(conn->protostr, "file"); /* store protocol string lowercase */  }  else {    /* Set default host and default path */    strcpy(conn->gname, "curl.haxx.se");    strcpy(conn->path, "/");    /* We need to search for '/' OR '?' - whichever comes first after host     * name but before the path. We need to change that to handle things like     * http://example.com?param= (notice the missing '/'). Later we'll insert     * that missing slash at the beginning of the path.     */    if (2 > sscanf(data->change.url,                   "%64[^\n:]://%512[^\n/?]%[^\n]",                   conn->protostr, conn->gname, conn->path)) {      /*       * The URL was badly formatted, let's try the browser-style _without_       * protocol specified like 'http://'.       */      if((1 > sscanf(data->change.url, "%512[^\n/?]%[^\n]",                     conn->gname, conn->path)) ) {        /*         * We couldn't even get this format.         */        failf(data, "<url> malformed");        return CURLE_URL_MALFORMAT;      }      /*       * Since there was no protocol part specified, we guess what protocol it       * is based on the first letters of the server name.       */      /* Note: if you add a new protocol, please update the list in       * lib/version.c too! 
*/      if(checkprefix("FTP", conn->gname)) {        strcpy(conn->protostr, "ftp");      }      else if(checkprefix("GOPHER", conn->gname))        strcpy(conn->protostr, "gopher");#ifdef USE_SSLEAY      else if(checkprefix("HTTPS", conn->gname))        strcpy(conn->protostr, "https");      else if(checkprefix("FTPS", conn->gname))        strcpy(conn->protostr, "ftps");#endif /* USE_SSLEAY */      else if(checkprefix("TELNET", conn->gname))        strcpy(conn->protostr, "telnet");      else if (checkprefix("DICT", conn->gname))        strcpy(conn->protostr, "DICT");      else if (checkprefix("LDAP", conn->gname))        strcpy(conn->protostr, "LDAP");      else {        strcpy(conn->protostr, "http");      }      conn->protocol |= PROT_MISSING; /* not given in URL */    }  }  /* If the URL is malformatted (missing a '/' after hostname before path) we   * insert a slash here. The only letter except '/' we accept to start a path   * is '?'.   */  if(conn->path[0] == '?') {    /* We need this function to deal with overlapping memory areas. We know       that the memory area 'path' points to is 'urllen' bytes big and that       is bigger than the path. Use +1 to move the zero byte too. 
*/    memmove(&conn->path[1], conn->path, strlen(conn->path)+1);    conn->path[0] = '/';  }  /*   * So if the URL was A://B/C,   *   conn->protostr is A   *   conn->gname is B   *   conn->path is /C   */  /*************************************************************   * Take care of proxy authentication stuff   *************************************************************/  if(conn->bits.proxy_user_passwd) {    char proxyuser[MAX_CURL_USER_LENGTH]="";    char proxypasswd[MAX_CURL_PASSWORD_LENGTH]="";    sscanf(data->set.proxyuserpwd,           "%" MAX_CURL_USER_LENGTH_TXT "[^:]:"           "%" MAX_CURL_PASSWORD_LENGTH_TXT "[^\n]",           proxyuser, proxypasswd);    conn->proxyuser = strdup(proxyuser);    if(!conn->proxyuser)      return CURLE_OUT_OF_MEMORY;        conn->proxypasswd = strdup(proxypasswd);    if(!conn->proxypasswd)      return CURLE_OUT_OF_MEMORY;  }  /*************************************************************   * Set a few convenience pointers   *************************************************************/  conn->name = conn->gname;  conn->ppath = conn->path;  conn->hostname = conn->name;  /*************************************************************   * Detect what (if any) proxy to use   *************************************************************/  if(!data->change.proxy) {    /* If proxy was not specified, we check for default proxy environment     * variables, to enable i.e Lynx compliance:     *     * http_proxy=http://some.server.dom:port/     * https_proxy=http://some.server.dom:port/     * ftp_proxy=http://some.server.dom:port/     * gopher_proxy=http://some.server.dom:port/     * no_proxy=domain1.dom,host.domain2.dom     *   (a comma-separated list of hosts whic
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -