⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tools.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 2 页
字号:
    p = NULL;  return p;}/****************************************//* duplicate n characters from string   *//****************************************/char *tl_strndup(const char *s, int n){  char *p;  if(!s)    return NULL;  p = (char *) _malloc(n + 1);  if(n)    strncpy(p, s, n);  *(p + n) = '\0';  return p;}/***************************************************//* create all directories in path specification    *//***************************************************/int makealldirs(const char *path){  char pom[PATH_MAX];  const char *p;  pom[0] = '\0';  if(path)  {    p = path;#ifdef __CYGWIN__/* we can't create new drive on WIN32, so the drive *//* specification part of path must be skipped     */#ifdef HAVE_NEWSTYLE_CYGWIN    if(p[0] == '/' && p[1] == '/')    {      char *sp;      if((sp = strchr(p + 2, '/')) && (sp = strchr(sp + 1, '/')))      {        strncpy(pom, p, sp - p);        pom[sp - p] = '\0';        p = sp;      }    }    else if(!strncmp(p, "/cygdrive/", 10) &&      (strlen(p) > 10) && tl_ascii_isalpha(p[10]))    {      strncpy(pom, p, 11);      pom[11] = '\0';      p += 11;    }    p += strspn(p, "/");#else    if(strlen(p) > 2 && p[0] == '/' && p[1] == '/' &&      tl_ascii_isalpha(p[2]) && (p[3] == '\0' || p[3] == '/'))    {      strncpy(pom, p, 3);      pom[3] = '\0';      p += 3;    }    p += strspn(p, "/");#endif#endif    while(*p)    {      int ilen = strcspn(p, "/");      strcat(pom, "/");      strncat(pom, p, ilen);      p += ilen;      p += strspn(p, "/");      if(*p && access(pom, F_OK))      {        if(mkdir(pom, S_IRWXU | S_IRGRP | S_IROTH | S_IXGRP | S_IXOTH))          return -1;      }    }  }  return 0;}/***********************************************//* z relativnej cesty urobi absolutnu a vyhodi *//* vsetky "." a ".." adresare                  *//* FIXME: Translate me!                        *//***********************************************/char *get_abs_file_path_oss(char *path){  char *p, pom[PATH_MAX], *tmp, result[PATH_MAX] = "/";  int ilen;  bool_t last = 1;  for(p = path; tl_ascii_isspace(*p) && *p; p++);  if(*p != '/')  {    tmp = (char *) getcwd(NULL, PATH_MAX);    if(priv_cfg.cache_dir)      sprintf(pom, "%s/%s", priv_cfg.cache_dir, p);    else      sprintf(pom, "%s/%s", tmp, p);    _free(tmp);  }  else  {    sprintf(pom, "%s", p);  }  p = pom;#ifdef __CYGWIN__#ifndef HAVE_NEWSTYLE_CYGWIN  /* workaround to allow //[drive]/... paths on WIN32 */  if(strlen(pom) > 2 && pom[0] == '/' && pom[1] == '/' &&    tl_ascii_isalpha(pom[2]) && (pom[3] == '\0' || pom[3] == '/'))  {    strncpy(result, pom, 3);    result[3] = '\0';    p = pom + 3;  }#else  /* workaround to allow //host/share/... paths on WIN32 */  /* AFAIK this type of paths work with cygwin-1.1 =<    */  if(pom[0] == '/' && pom[1] == '/')  {    strcpy(result, "//");    p++;  }#endif#endif  if(!*p)    strcpy(result, "/");  while(*p)  {    ilen = strcspn(p, "/");    if(*(p + ilen))      *(p + ilen) = '\0';    else      last = 0;    if(strcmp(p, "."))    {      if(strcmp(p, ".."))      {        if(!tl_is_dirname(result))          strcat(result, "/");        strcat(result, p);      }      else      {        tmp = strrchr(result, '/');        *(tmp + 1 - (tmp != result)) = '\0';      }    }    p += ilen + last;    p += strspn(p, "/");  }  ilen = strlen(path);  p = path + ilen - 1;  if(ilen && *p != '/' && tl_is_dirname(result))  {    result[strlen(result) - 1] = '\0';  }  if((tl_is_dirname(path) && !tl_is_dirname(result)) || (strlen(result) == 0))  {    result[strlen(result) + 1] = '\0';    result[strlen(result)] = '/';  }  return tl_strdup(result);}/***********************************************//* z relativnej cesty urobi absolutnu a vyhodi *//* vsetky "." a ".." adresare                  *//* FIXME: Translate me!                        *//***********************************************/char *get_abs_file_path(char *path){  char *p, pom[PATH_MAX], *tmp, result[PATH_MAX] = "/";  int ilen;  bool_t last = 1;  for(p = path; tl_ascii_isspace(*p) && *p; p++);  if(*p != '/')  {    tmp = (char *) getcwd(NULL, PATH_MAX);    sprintf(pom, "%s/%s", tmp ? tmp : "", p);    _free(tmp);  }  else  {    sprintf(pom, "%s", p);  }  p = pom;  if(!*p)    strcpy(result, "/");  while(*p)  {    ilen = strcspn(p, "/");    if(*(p + ilen))      *(p + ilen) = '\0';    else      last = 0;    if(strcmp(p, "."))    {      if(strcmp(p, ".."))      {        if(!tl_is_dirname(result))          strcat(result, "/");        strcat(result, p);      }      else      {        tmp = strrchr(result, '/');        *(tmp + 1 - (tmp != result)) = '\0';      }    }    p += ilen + last;    p += strspn(p, "/");  }  ilen = strlen(path);  p = path + ilen - 1;  if(ilen && *p != '/' && tl_is_dirname(result) == '/')  {    result[strlen(result) - 1] = '\0';  }  if((tl_is_dirname(path) && !tl_is_dirname(result)) || (strlen(result) == 0))  {    result[strlen(result) + 1] = '\0';    result[strlen(result)] = '/';  }  return tl_strdup(result);}#ifdef __CYGWIN__char *cvt_win32_to_unix_path(char *path){  char pom[PATH_MAX];  char *p = tl_strdup(path);  char *p1;  pom[0] = '\0';  p1 = p;  if(strlen(p) >= 2 && tl_ascii_isalpha(p[0]) && p[1] == ':')  {#ifdef HAVE_NEWSTYLE_CYGWIN    sprintf(pom, "/cygdrive/%c", p[0]);    p += 2;#else    sprintf(pom, "//%c", p[0]);    p += 2;#endif  }  _strtrchr(p, '\\', '/');  strcat(pom, p);  _free(p1);  return get_abs_file_path_oss(pom);}char *cvt_unix_to_win32_path(char *path){  char res[PATH_MAX];  cygwin32_conv_to_win32_path(path, res);  return tl_strdup(res);}#endif/**********************************//* spocita vyskyt znaku v retazci *//* FIXME: Translate me!           *//**********************************/static int str_cnt_chr(char *s, int c){  char *p = s;  int ret = 0;  while(*p)  {    if(*p == c)      ret++;    p++;  }  return ret;}/********************************************//* urci relativnu cestu z adresara v ktorom *//* sa nachadza prvy subor na druhy subor    *//* FIXME: Translate me!                     *//********************************************/char *get_relative_path(char *fromabs, char *toabs){  char *p1, *p2, *pom1, *pom2;  int offset = 0, i, plom;  char *rv = NULL;  pom1 = p1 = get_abs_file_path_oss(fromabs);  pom2 = p2 = get_abs_file_path_oss(toabs);  while(*p1 && *p2 && *p1 == *p2)  {    p1++;    p2++;  }#if 0  /* this is not good behaviour, as lynx and netscape behaves */  /* differently on empty HREFs                               */  if(!strcmp(p1, p2))  {    free(pom1);    free(pom2);    return tl_strdup("");  }#endif  if(*p1)    p1--;  while((p1 >= pom1) && *p1 != '/')    p1--;  if(*p1 != '/')  {    free(pom1);    free(pom2);    return NULL;  }  offset = p1 - pom1;  plom = str_cnt_chr(p1 + 1, '/');  for(i = 0; i < plom; i++)  {    rv = tl_str_concat(rv, "../", NULL);  }  rv = tl_str_concat(rv, pom2 + offset + 1, NULL);  free(pom1);  free(pom2);  return rv;}/********************************//* vrati poziciu pripony suboru *//* FIXME: Translate me!         *//********************************/char *tl_get_extension(char *fname){  char *p1, *p2;  p1 = strrchr(fname, '.');  p2 = strrchr(fname, '/');  if(p1 > p2)  {    return (p1 + 1);  }  else    return "";}char *tl_get_basename(char *fname){  char *p;  if((p = strrchr(fname, '/')))    p++;  else    p = fname;  return p;}static const char *html_tag_tab[] = {  "<HTML",  "<html",  "<HEAD",  "<head",  "<META",  "<meta",  "<TITLE",  "<title",  "<BODY",  "<body",  "<script",  "<SCRIPT",  "<style",  "<STYLE",  "<!DOCTYPE HTML",  "<!doctype html",  "<!--",};bool_t ext_is_html(char *fn){  char *ext = tl_get_extension(fn);  return str_is_in_list(0, ext, "html", "htm", "shtml", "phtml", "css", NULL);}static bool_t ext_is_nothtml(char *fn){  char *ext = tl_get_extension(fn);  return str_is_in_list(0, ext, "gif", "jpg", "jpeg", "png", "mpeg",    "mpg", "avi", "pdf", "gz", "tgz", "zip", "arj",    "hqx", "rar", "tar", "Z", "doc", "doc", "xls", "wav", "au", "mp3", NULL);}bool_t file_is_html(char *fn){  int i, j, len;  char pom[256];  bufio *fd;  if(ext_is_html(fn))    return TRUE;  if(ext_is_nothtml(fn))    return FALSE;  if((fd = bufio_open(fn, O_BINARY | O_RDONLY)))  {    for(j = 0; j < 10; j++)    {      if((len = bufio_readln(fd, pom, sizeof(pom))) > 0)      {        for(i = 0; i < NUM_ELEM(html_tag_tab); i++)          if(strstr(pom, html_tag_tab[i]))          {            bufio_close(fd);            return TRUE;          }      }      else      {        if(len < 0)          xperror("file_is_html");        bufio_close(fd);        return FALSE;      }    }    bufio_close(fd);  }  return FALSE;}void tl_sleep(unsigned int s){  /* if we measure timings, we don't sleep */  if(cfg.time_logfile)  {    return;  }#ifndef HAVE_MT  if(cfg.xi_face)  {    gui_msleep(s * 1000);  }  else#endif  {#ifdef HAVE_MT    struct timeval tout;    tout.tv_sec = s;    tout.tv_usec = 0;    select(0, NULL, NULL, NULL, &tout);#else    sleep(s);#endif  }}void tl_msleep(unsigned int ms){  /* if we measure timings, we don't sleep */  if(cfg.time_logfile)  {    return;  }#ifndef HAVE_MT  if(cfg.xi_face)  {    gui_msleep(ms);  }  else#endif#if defined HAVE_USLEEP && !defined HAVE_MT    usleep(ms * 1000);#else  {    struct timeval tout;    tout.tv_sec = ms / 1000;    tout.tv_usec = (ms % 1000) * 1000;    select(0, NULL, NULL, NULL, &tout);  }#endif}int unlink_recursive(char *fn){  struct stat estat;  if(lstat(fn, &estat))  {    xperror(fn);    return -1;  }  if(!S_ISDIR(estat.st_mode))  {    if(unlink(fn))    {      xperror(fn);      return (-1);    }  }  else  {    DIR *dir;    struct dirent *dent;    char next_dir[PATH_MAX];    if(!(dir = opendir(fn)))    {      xperror(fn);      return -1;    }    while((dent = readdir(dir)))    {      sprintf(next_dir, "%s/%s", fn, dent->d_name);      if(!strcmp(dent->d_name, "."))        continue;      if(!strcmp(dent->d_name, ".."))        continue;      unlink_recursive(next_dir);    }    closedir(dir);    if(rmdir(fn))    {      xperror(next_dir);      return -1;    }  }  return 0;}int str_is_in_list(int casesensitive, char *str, ...){  char *which;  va_list args;  int found = FALSE;  va_start(args, str);  for(which = va_arg(args, char *); which; which = va_arg(args, char *))  {    if(casesensitive ? !strcmp(str, which) : !strcasecmp(str, which))    {      found = TRUE;      break;    }  }  va_end(args);  return found;}int copy_fd_to_file(int fd, char *filename){  char pom[32768];  int len;  int dfd;  if((dfd = open(filename, O_BINARY | O_WRONLY | O_CREAT, 0644)) < 0)  {    xperror(filename);    return -1;  }  lseek(fd, 0, SEEK_SET);  while((len = read(fd, pom, sizeof(pom))) > 0)  {    if(len != write(dfd, pom, len))      return -1;  }  close(dfd);  lseek(fd, 0, SEEK_END);  return len;}char *tl_adjust_filename(char *path){  char *pom;  char *p, *p2;  int l, n;  pom = _malloc(strlen(path) + 1);  p = pom;#ifdef __CYGWIN__#ifndef HAVE_NEWSTYLE_CYGWIN  l = strspn(path, "/");  strncpy(p, path, l);  p += l;  *p = '\0';  path += l;#endif#endif  while(*path)  {    n = strspn(path, "/");    path += n;    l = strcspn(path, "/");    if(n)    {      *p = '/';      p++;    }    if(!*(path + l) && ((NAME_MAX - 4) < l))    {      strncpy(p, path + l - (NAME_MAX - 4), NAME_MAX - 4);      p += NAME_MAX - 4;    }    else if(l > NAME_MAX)    {      strncpy(p, path, NAME_MAX);      p += NAME_MAX;    }    else    {      strncpy(p, path, l);      p += l;    }    path += l;  }  *p = '\0';  n = strlen(pom);  p = strrchr(pom, '/');  while(p && (n > PATH_MAX))  {    *p = '\0';    p2 = strrchr(pom, '/');    if(p2)    {      strcpy(p2 + 1, p + 1);      n -= p - p2;    }    p = p2;  }  return pom;}int tl_filename_needs_adjust(char *path){  int l;  if(strlen(path) > PATH_MAX)    return TRUE;  while(*path)  {    path += strspn(path, "/");    l = strcspn(path, "/");    if(!*(path + l) && ((NAME_MAX - 4) < l))      return TRUE;    else if(l > NAME_MAX)      return TRUE;    path += l;  }  return FALSE;}int tl_is_dirname(const char *path){  const char *p = strrchr(path, '/');  return (p && (*(p + 1) == '\0'));}char *tl_str_append(char *str1, char *str2){  int l1, l2;  char *rv;  l1 = str1 ? strlen(str1) : 0;  l2 = strlen(str2);  rv = _realloc(str1, l1 + l2 + 1);  strcpy(rv + l1, str2);  return rv;}char *tl_str_nappend(char *str1, const char *str2, int n){  int l1;  char *rv;  l1 = str1 ? strlen(str1) : 0;  rv = _realloc(str1, l1 + n + 1);  strncpy(rv + l1, str2, n);  rv[l1 + n] = '\0';  return rv;}char *tl_str_concat(char *str1, ...){  char *p;  va_list args;  int len;  char *rv = str1;  len = str1 ? strlen(str1) : 0;  va_start(args, str1);  for(p = va_arg(args, char *); p; p = va_arg(args, char *))  {    int slen = strlen(p);    rv = _realloc(rv, len + slen + 1);    strcpy(rv + len, p);    len += slen;  }  va_end(args);  return rv;}char *tl_data_concat_str(int *len, char *data, ...){  char *p;  va_list args;  char *rv = data;  va_start(args, data);  for(p = va_arg(args, char *); p; p = va_arg(args, char *))  {    int slen = strlen(p);    rv = _realloc(rv, *len + slen + 1);    strcpy(rv + *len, p);    *len += slen;  }  va_end(args);  return rv;}char *tl_data_concat_data(int *tlen, char *tdata, int len, char *data){  tdata = _realloc(tdata, *tlen + len);  memcpy(tdata + *tlen, data, len);  *tlen += len;  return tdata;}char *tl_load_text_file(char *filename){  char pom[1024];  int tlen, len, fd;  char *rv = NULL;  if((fd = open(filename, O_RDONLY | O_BINARY)) < 0)  {    xperror(filename);    return NULL;  }  tlen = 0;  while((len = read(fd, pom, sizeof(pom))) > 0)  {    rv = tl_data_concat_data(&tlen, rv, len, pom);  }  close(fd);  if(rv)  {    rv = _realloc(rv, tlen + 1);    rv[tlen] = '\0';  }  return rv;}int tl_save_text_file(char *filename, char *content, int length){  int fd;  int rv = 0;  if(length < 0)    length = strlen(content);  if((fd = open(filename, O_WRONLY | O_BINARY | O_CREAT | O_TRUNC), 0644) < 0)  {    xperror(filename);    return -1;  }  if(write(fd, content, length) != length)  {    xperror(filename);    rv = -1;  }  close(fd);  return rv;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -