📄 htparse.c
字号:
/*              Parse HyperText Document Address                HTParse.c
**              ================================
*/

#include <HTUtils.h>
#include <HTParse.h>

#include <LYLeaks.h>

#define HEX_ESCAPE '%'

/*
**  The pieces of an address as found by scan().  Every non-NULL member
**  points INTO the (modified) string that was handed to scan().
*/
struct struct_parts {
    char * access;
    char * host;
    char * absolute;
    char * relative;
    char * search;              /* treated normally as part of path */
    char * anchor;
};

/*      Strip white space off a string.                         HTStrip()
**      -------------------------------
**
** On entry,
**      s       points to a writable, zero terminated string.
** On exit,
**      Return value points to first non-white character, or to the
**      terminating zero if there is none.
**      All trailing white space is OVERWRITTEN with zero.
*/
PUBLIC char * HTStrip ARGS1(
        char *,         s)
{
#define SPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n'))
    char * end = s + strlen(s);         /* Find end of string */

    /*
    **  Zap trailing blanks.  Walk back from the terminator and never
    **  step below s, so no pointer before the buffer is ever formed.
    */
    while (end > s && SPACE(end[-1]))
        *--end = '\0';

    while (SPACE(*s))
        s++;                            /* Strip leading blanks */

    return s;
}

/*      Scan a filename for its constituents.                   scan()
**      -------------------------------------
**
** On entry,
**      name    points to a document name which may be incomplete.
**              The string is MODIFIED: separators are overwritten with
**              zero so that each part becomes its own string.
** On exit,
**      absolute or relative may be nonzero (but not both).
**      host, anchor and access may be nonzero if they were specified.
**      search is nonzero only when the host was ended directly by a '?'
**      (no slash); otherwise a query stays part of the path.
**      Any which are nonzero point to zero terminated strings.
*/
PRIVATE void scan ARGS2(
        char *,                 name,
        struct struct_parts *,  parts)
{
    char * after_access;
    char * p;

    parts->access = NULL;
    parts->host = NULL;
    parts->absolute = NULL;
    parts->relative = NULL;
    parts->search = NULL;       /* normally not used - kw */
    parts->anchor = NULL;

    /*
    **  Scan left-to-right for a scheme (access).
    **  A '/', '#', ';' or '?' before any ':' means there is no scheme.
    */
    after_access = name;
    for (p = name; *p; p++) {
        if (*p == ':') {
            *p = '\0';
            parts->access = name;       /* Access name has been specified */
            after_access = (p + 1);
            break;
        }
        if (*p == '/' || *p == '#' || *p == ';' || *p == '?')
            break;
    }

    /*
    **  Scan left-to-right for a fragment (anchor).
    **  Only the first '#' is a separator; later ones are left alone - kw
    */
    p = strchr(after_access, '#');
    if (p != NULL) {
        parts->anchor = (p + 1);
        *p = '\0';                      /* terminate the rest */
    }

    /*
    **  Scan left-to-right for a host or absolute path.
    */
    p = after_access;
    if (*p == '/') {
        if (p[1] == '/') {
            parts->host = (p + 2);      /* host has been specified    */
            *p = '\0';                  /* Terminate access           */
            p = strchr(parts->host, '/'); /* look for end of host name if any */
            if (p != NULL) {
                *p = '\0';              /* Terminate host */
                parts->absolute = (p + 1);      /* Root has been found */
            } else {
                p = strchr(parts->host, '?');
                if (p != NULL) {
                    *p = '\0';          /* Terminate host */
                    parts->search = (p + 1);
                }
            }
        } else {
            parts->absolute = (p + 1);  /* Root found but no host */
        }
    } else {
        parts->relative = (*after_access) ?
                             after_access : NULL; /* NULL for "" */
    }

    /*
    **  Check schemes that commonly have unescaped hashes.
    */
    if (parts->access && parts->anchor) {
        if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) ||
            !strcasecomp(parts->access, "nntp") ||
            !strcasecomp(parts->access, "snews") ||
            !strcasecomp(parts->access, "news") ||
            !strcasecomp(parts->access, "data")) {
            /*
             *  Access specified but no host and not a lynxcgi URL, so the
             *  anchor may not really be one, e.g., news:j462#36487@foo.bar,
             *  or it's an nntp or snews URL, or news URL with a host.
             *  Restore the '#' in the address.
             */
            /*  but only if we have found a path component of which this will
             *  become part. - kw  */
            if (parts->relative || parts->absolute) {
                *(parts->anchor - 1) = '#';
                parts->anchor = NULL;
            }
        }
    }

    /*
    **  Note: a query ('?...') inside the path is deliberately NOT split
    **  off here; it is just treated as part of the path.
    */
} /*scan */

/*      Parse a Name relative to another name.                  HTParse()
**      --------------------------------------
**
**      This returns those parts of a name which are given (and requested)
**      substituting bits from the related name where necessary.
**
** On entry,
**      aName           A filename given (NULL is treated as "")
**      relatedName     A name relative to which aName is to be parsed
**                      (NULL is treated as "")
**      wanted          A mask for the bits which are wanted (PARSE_*).
**                      PARSE_STRICTPATH asks for the path without any
**                      query, PARSE_QUERY for the query only; both
**                      together, or either with PARSE_PATH, mean the
**                      plain PARSE_PATH.
**
** On exit,
**      returns         A pointer to an allocated string which MUST BE FREED
*/
PUBLIC char * HTParse ARGS3(
        CONST char *,   aName,
        CONST char *,   relatedName,
        int,            wanted)
{
    char * result = NULL;
    char * return_value = NULL;
    size_t len;
    char * name = NULL;
    char * rel = NULL;
    char * p;
    char * acc_method;
    struct struct_parts given, related;

    /*
    **  Be tolerant of missing arguments instead of crashing in strlen().
    */
    if (aName == NULL)
        aName = "";
    if (relatedName == NULL)
        relatedName = "";

    CTRACE((tfp, "HTParse: aName:`%s'\n", aName));
    CTRACE((tfp, "   relatedName:`%s'\n", relatedName));

    if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) { /* if detail wanted... */
        if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY))
            == (PARSE_STRICTPATH | PARSE_QUERY)) /* if strictpath AND query */
            wanted |= PARSE_PATH; /* then treat as if PARSE_PATH wanted */
        if (wanted & PARSE_PATH)  /* if PARSE_PATH wanted */
            wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY); /* ignore details */
    }

    /*
    **  Allocate the output string.
    */
    len = strlen(aName) + strlen(relatedName) + 10;
    result = typecallocn(char, len);    /* Lots of space: more than enough */
    if (result == NULL) {
        outofmem(__FILE__, "HTParse");
    }

    /*
    **  Make working copies of the input strings to cut up.
    */
    StrAllocCopy(name, aName);
    StrAllocCopy(rel, relatedName);

    /*
    **  Cut up the strings into URL fields.
    */
    scan(name, &given);
    scan(rel,  &related);

    /*
    **  Handle the scheme (access) field.
    */
    if (given.access && given.host && !given.relative && !given.absolute) {
        if (!strcmp(given.access, "http") ||
            !strcmp(given.access, "https") ||
            !strcmp(given.access, "ftp"))
            /*
            **  Assume root.
            */
            given.absolute = "";
    }
    acc_method = given.access ? given.access : related.access;
    if (wanted & PARSE_ACCESS) {
        if (acc_method) {
            strcat(result, acc_method);
            if (wanted & PARSE_PUNCTUATION)
                strcat(result, ":");
        }
    }

    /*
    **  If different schemes, inherit nothing.
    **
    **  We'll try complying with RFC 1808 and
    **  the Fielding draft, and inherit nothing
    **  if both schemes are given, rather than
    **  only when they differ, except for
    **  file URLs - FM
    **
    **  After trying it for a while, it's still
    **  premature, IHMO, to go along with it, so
    **  this is back to inheriting for identical
    **  schemes whether or not they are "file".
    **  If you want to try it again yourself,
    **  uncomment the strcasecomp() below. - FM
    */
    if ((given.access && related.access) &&
        (/* strcasecomp(given.access, "file") || */
         strcmp(given.access, related.access))) {
        related.host = NULL;
        related.absolute = NULL;
        related.relative = NULL;
        related.search = NULL;
        related.anchor = NULL;
    }

    /*
    **  Handle the host field.
    */
    if (wanted & PARSE_HOST) {
        if (given.host || related.host) {
            char *tail = result + strlen(result);
            if (wanted & PARSE_PUNCTUATION)
                strcat(result, "//");
            strcat(result, given.host ? given.host : related.host);
#define CLEAN_URLS
#ifdef CLEAN_URLS
            /*
            **  Ignore default port numbers, and trailing dots on FQDNs,
            **  which will only cause identical addresses to look different.
            */
            {
                char *p2, *h;
                if ((p2 = strchr(result, '@')) != NULL)
                    tail = (p2 + 1);    /* skip any user-info part */
                p2 = strchr(tail, ':');
                if (p2 != NULL && !isdigit(UCH(p2[1])))
                    /*
                    **  Colon not followed by a port number.
                    */
                    *p2 = '\0';
                if (p2 != NULL && *p2 != '\0' && acc_method != NULL) {
                    /*
                    **  Port specified.
                    */
                    if ((!strcmp(acc_method, "http"      ) && !strcmp(p2, ":80" )) ||
                        (!strcmp(acc_method, "https"     ) && !strcmp(p2, ":443")) ||
                        (!strcmp(acc_method, "gopher"    ) && !strcmp(p2, ":70" )) ||
                        (!strcmp(acc_method, "ftp"       ) && !strcmp(p2, ":21" )) ||
                        (!strcmp(acc_method, "wais"      ) && !strcmp(p2, ":210")) ||
                        (!strcmp(acc_method, "nntp"      ) && !strcmp(p2, ":119")) ||
                        (!strcmp(acc_method, "news"      ) && !strcmp(p2, ":119")) ||
                        (!strcmp(acc_method, "newspost"  ) && !strcmp(p2, ":119")) ||
                        (!strcmp(acc_method, "newsreply" ) && !strcmp(p2, ":119")) ||
                        (!strcmp(acc_method, "snews"     ) && !strcmp(p2, ":563")) ||
                        (!strcmp(acc_method, "snewspost" ) && !strcmp(p2, ":563")) ||
                        (!strcmp(acc_method, "snewsreply") && !strcmp(p2, ":563")) ||
                        (!strcmp(acc_method, "finger"    ) && !strcmp(p2, ":79" )) ||
                        (!strcmp(acc_method, "telnet"    ) && !strcmp(p2, ":23" )) ||
                        (!strcmp(acc_method, "tn3270"    ) && !strcmp(p2, ":23" )) ||
                        (!strcmp(acc_method, "rlogin"    ) && !strcmp(p2, ":513")) ||
                        (!strcmp(acc_method, "cso"       ) && !strcmp(p2, ":105")))
                        *p2 = '\0';     /* It is the default: ignore it */
                }
                if (p2 == NULL) {
                    int len2 = strlen(tail);

                    if (len2 > 0) {
                        h = tail + len2 - 1;    /* last char of hostname */
                        if (*h == '.')
                            *h = '\0';          /* chop final . */
                    }
                } else if (p2 != result) {
                    h = p2;
                    h--;                /* End of hostname */
                    if (*h == '.') {
                        /*
                        **  Slide p2 over h.
                        */
                        while (*p2 != '\0')
                            *h++ = *p2++;
                        *h = '\0';      /* terminate */
                    }
                }
            }
#endif /* CLEAN_URLS */
        }
    }

    /*
    **  If host in given or related was ended directly with a '?' (no
    **  slash), fake the search part into absolute.  This is the only
    **  case search is returned from scan.  A host must have been present.
    **  this restores the '?' at which the host part had been truncated in
    **  scan, we have to do this after host part handling is done. - kw
    */
    if (given.search && *(given.search - 1) == '\0') {
        given.absolute = given.search - 1;
        given.absolute[0] = '?';
    } else if (related.search && !related.absolute &&
               *(related.search - 1) == '\0') {
        related.absolute = related.search - 1;
        related.absolute[0] = '?';
    }

    /*
    **  If different hosts, inherit no path.
    */
    if (given.host && related.host)
        if (strcmp(given.host, related.host) != 0) {
            related.absolute = NULL;
            related.relative = NULL;
            related.anchor = NULL;
        }

    /*
    **  Handle the path.
    */
    if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) {
        char *tail = NULL;
        int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY));

        /*
        **  Remember where the path starts, so that it can be split into
        **  strict path and query once it has been assembled.
        */
        if (want_detail)
            tail = result + strlen(result);
        if (acc_method && !given.absolute && given.relative) {
            if (!strcasecomp(acc_method, "nntp") ||
                !strcasecomp(acc_method, "snews") ||
                (!strcasecomp(acc_method, "news") &&
                 !strncasecomp(result, "news://", 7))) {
                /*
                 *  Treat all given nntp or snews paths,
                 *  or given paths for news URLs with a host,
                 *  as absolute.
                 */
                given.absolute = given.relative;
                given.relative = NULL;
            }
        }
        if (given.absolute) {                   /* All is given */
            if (wanted & PARSE_PUNCTUATION)
                strcat(result, "/");
            strcat(result, given.absolute);
            CTRACE((tfp, "HTParse: (ABS)\n"));
        } else if (related.absolute) {          /* Adopt path not name */
            strcat(result, "/");
            strcat(result, related.absolute);
            if (given.relative) {
                p = strchr(result, '?');        /* Search part? */
                if (p == NULL)
                    p = (result + strlen(result) - 1);
                /* The "/" appended above guarantees this loop stops. */
                for (; *p != '/'; p--)
                    ;                           /* last / */
                p[1] = '\0';                    /* Remove filename */
                strcat(result, given.relative); /* Add given one */
                HTSimplify (result);
            }
            CTRACE((tfp, "HTParse: (Related-ABS)\n"));
        } else if (given.relative) {
            strcat(result, given.relative);     /* what we've got */
            CTRACE((tfp, "HTParse: (REL)\n"));
        } else if (related.relative) {
            strcat(result, related.relative);
            CTRACE((tfp, "HTParse: (Related-REL)\n"));
        } else {  /* No inheritance */
            if (strncasecomp(aName, "lynxcgi:", 8) &&
                strncasecomp(aName, "lynxexec:", 9) &&
                strncasecomp(aName, "lynxprog:", 9)) {
                strcat(result, "/");
            }
            if (!strcmp(result, "news:/"))
                result[5] = '*';
            CTRACE((tfp, "HTParse: (No inheritance)\n"));
        }
        if (want_detail) {
            p = strchr(tail, '?');      /* Search part? */
            if (p) {
                /*
                **  Test the caller's mask, not the bare constant: the
                **  constant is always true, which made a PARSE_QUERY
                **  request return the path instead of the query.
                */
                if (wanted & PARSE_STRICTPATH) {
                    *p = '\0';          /* keep path, drop query */
                } else {
                    /*
                    **  PARSE_QUERY: slide the query down over the path,
                    **  keeping the '?' only if punctuation is wanted.
                    */
                    if (!(wanted & PARSE_PUNCTUATION))
                        p++;
                    do {
                        *tail++ = *p;
                    } while (*p++);
                }
            } else {
                if (wanted & PARSE_QUERY)
                    *tail = '\0';       /* no query: drop the path */
            }
        }
    }

    /*
    **  Handle the fragment (anchor).
    **  An empty given anchor suppresses inheritance of the related one.
    */
    if (wanted & PARSE_ANCHOR)
        if ((given.anchor && *given.anchor) ||
            (!given.anchor && related.anchor)) {
            if (wanted & PARSE_PUNCTUATION)
                strcat(result, "#");
            strcat(result, (given.anchor) ?
                             given.anchor : related.anchor);
        }
    CTRACE((tfp, "HTParse: result:%s\n", result));
    FREE(rel);
    FREE(name);

    StrAllocCopy(return_value, result);
    FREE(result);

    return return_value;                /* exactly the right length */
}

/*      Simplify a filename.                            HTSimplify()
**      --------------------
**
**  A unix-style file is allowed to contain the seqeunce xxx/../ which may
**  be replaced by "" , and the seqeunce "/./" which may be replaced by "/".
**  Simplification helps us recognize duplicate filenames.
**
**      Thus,   /etc/junk/../fred       becomes /etc/fred
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -