📄 archer.c
字号:
int sequence_occurrence_count = 0; int something_was_greater_than_max; bow_wa *wa; float scaler; archer_doc *doc; /* Parse the query */ lex = bow_default_lexer->open_str (bow_default_lexer, (char*)query_string); if (lex == NULL) return NULL; query_len = 0; while (bow_default_lexer->get_word (bow_default_lexer, lex, word, BOW_MAX_WORD_LENGTH)) { /* Add the field-restricting suffix string, e.g. "xxxtitle" */ if (suffix_string[0]) { strcat (word, "xxx"); strcat (word, suffix_string); assert (strlen (word) < BOW_MAX_WORD_LENGTH); } wi = bow_word2int_no_add (word); if (wi >= 0) { di[query_len] = pi[query_len] = -300; query[query_len++] = wi; } else if ((bow_lexer_stoplist_func && !(*bow_lexer_stoplist_func) (word)) || (!bow_lexer_stoplist_func && strlen (word) < 2)) { /* If a query term wasn't present, and its not because it was in the stoplist or the word is a single char, then return no hits. */ query_len = 0; break; } /* If we have no more room for more query terms, just don't use the rest of them. */ if (query_len >= MAX_QUERY_WORDS) break; } bow_default_lexer->close (bow_default_lexer, lex); if (query_len == 0) return NULL; if (query_len == 1) { wa = archer_query_hits_matching_wi (query[0], &sequence_occurrence_count); goto search_done; } /* Initialize the array of document scores */ wa = bow_wa_new (0); /* Search for documents containing the query words in the same order as the query. */ bow_wi2pv_rewind (archer_wi2pv); max_di = max_pi = -200; /* Loop while we look for matches. We'll break out of this loop when any of the query words are at the end of their PV's. */ for (;;) { /* Keep reading DI and PI from one or more of the query-word PVs until none of the DIs or PIs is greater than the MAX_DI or MAX_PI. At this point the DIs and PI's should all be equal, indicating a match. Break out of this loop if all PVs are at the end, (at which piont they return -1 for both DI and PI). */ do { something_was_greater_than_max = 0; for (i = 0; i < query_len; i++) { /* Keep looking for instances of word query[wi] */ while (di[i] != -1 && (di[i] < max_di || (di[i] <= max_di && pi[i] < max_pi))) { bow_wi2pv_wi_next_di_pi (archer_wi2pv, query[i], &(di[i]), &(pi[i])); /* If any of the query words is at the end of their PV, then we're not going to find any more matches, and we're done setting the scores. Go print the matching documents. */ if (di[i] == -1) goto search_done; /* Make it so that all PI's will be equal if the words are in order. */ pi[i] -= i; bow_verbosify (bow_verbose, "%20s %10d %10d %10d %10d\n", bow_int2word (query[i]), di[i], pi[i], max_di, max_pi); } if (di[i] > max_di) { max_di = di[i]; max_pi = pi[i]; something_was_greater_than_max = 1; } else if (pi[i] > max_pi && di[i] == max_di) { max_pi = pi[i]; something_was_greater_than_max = 1; } } } while (something_was_greater_than_max); bow_verbosify (bow_verbose, "something_was_greater_than_max di=%d\n", di[0]); for (i = 1; i < query_len; i++) assert (di[i] == di[0] && pi[i] == pi[0]); /* Make sure this DI'th document hasn't been deleted. If it hasn't then add this DI to the WA---the list of hits */ doc = bow_sarray_entry_at_index (archer_docs, di[0]); if (doc->word_count > 0) { bow_wa_add_to_end (wa, di[0], 1); sequence_occurrence_count++; } /* Set up so that next time through we'll read the next words from each PV. */ for (i = 0; i < query_len; i++) { if (di[i] != -1) di[i] = -300; if (pi[i] != -1) pi[i] = -300; } } search_done: if (wa->length == 0) { bow_wa_free (wa); return NULL; } /* Scale the scores by the log of the occurrence count of this sequence, and take the log of the count (shifted) to encourage documents that have all query term to be ranked above documents that have many repetitions of a few terms. */ if (!archer_arg_state.score_is_raw_count) { double document_frequency = wa->length; scaler = 1.0 / log (5 + document_frequency); for (i = 0; i < wa->length; i++) wa->entry[i].weight = scaler * log (5 + wa->entry[i].weight); } return wa;}/* A temporary hack. Also, does not work for queries containing repeated words */voidarcher_query (){ int i; int num_hits_to_print;#define NUM_FLAGS 3 enum {pos = 0, reg, neg, num_flags}; struct _word_hit { const char *term; bow_wa *wa; int flag; } word_hits[num_flags][MAX_QUERY_WORDS]; int word_hits_count[num_flags]; int current_wai[num_flags][MAX_QUERY_WORDS]; struct _doc_hit { int di; float score; const char **terms; int terms_count; } *doc_hits; int doc_hits_count; int doc_hits_size; bow_wa *term_wa; int current_di, h, f, min_di; int something_was_greater_than_max; char *query_copy, *query_remaining, *end; char query_string[BOW_MAX_WORD_LENGTH]; char suffix_string[BOW_MAX_WORD_LENGTH]; int found_flag, flag, length; /* For sorting the combined list of document hits */ int compare_doc_hits (struct _doc_hit *hit1, struct _doc_hit *hit2) { if (hit1->score < hit2->score) return 1; else if (hit1->score == hit2->score) return 0; else return -1; } void archer_sort_hits (struct _doc_hit *hits, int hits_count, int num_to_sort) { int i, j,max_j; float max_score; struct _doc_hit tmp; /* Find the highest score NUM_TO_SORT times */ for (i = 0; i < num_to_sort; i++) { /* Find the next highest score */ max_score = -FLT_MAX; max_j = -1; for (j = i; j < hits_count; j++) { if (hits[j].score > max_score) { max_score = hits[j].score; max_j = j; } } /* Move the high score into position */ assert (max_j >= 0); tmp = hits[i]; hits[i] = hits[max_j]; hits[max_j] = tmp; } } /* Initialize the list of target documents associated with each term */ for (i = 0; i < num_flags; i++) word_hits_count[i] = 0; /* Initialize the combined list of target documents */ doc_hits_size = 1000; doc_hits_count = 0; doc_hits = bow_malloc (doc_hits_size * sizeof (struct _doc_hit)); /* Process each term in the query. Quoted sections count as one term here. */ query_remaining = query_copy = strdup (archer_arg_state.query_string); assert (query_copy); /* Chop any trailing newline or carriage return. */ end = strpbrk (query_remaining, "\n\r"); if (end) *end = '\0'; while (*query_remaining) { /* Find the beginning of the next query term, and record +/- flags */ while (*query_remaining && (!isalnum ((unsigned char)*query_remaining) && *query_remaining != ':' && *query_remaining != '+' && *query_remaining != '-' && *query_remaining != '"')) query_remaining++; flag = reg; found_flag = 0; if (*query_remaining == '\0') { break; } if (*query_remaining == '+') { query_remaining++; flag = pos; } else if (*query_remaining == '-') { query_remaining++; flag = neg; } /* See if there is a field-restricting tag here, and if so, deal with it */ if ((end = strpbrk (query_remaining, ": \"\t")) && *end == ':') { /* The above condition ensures that a ':' appears before any term-delimiters */ /* Find the end of the field-restricting suffix */ length = end - query_remaining; assert (length < BOW_MAX_WORD_LENGTH); /* Remember the suffix, and move ahead the QUERY_REMAINING */ memcpy (suffix_string, query_remaining, length); suffix_string[length] = '\0'; query_remaining = end + 1; } else suffix_string[0] = '\0'; /* Find the end of the next query term. */ if (*query_remaining == '"') { query_remaining++; end = strchr (query_remaining, '"'); } else { end = strchr (query_remaining, ' '); } if (end == NULL) end = strchr (query_remaining, '\0'); /* Put the next query term into QUERY_STRING and increment QUERY_REMAINING */ length = end - query_remaining; length = MIN (length, BOW_MAX_WORD_LENGTH-1); memcpy (query_string, query_remaining, length); query_string[length] = '\0'; if (*end == '"') query_remaining = end + 1; else query_remaining = end; if (length == 0) continue; /* printf ("%d %s\n", flag, query_string); */ /* Get the list of documents matching the term */ term_wa = archer_query_hits_matching_sequence (query_string, suffix_string); if (!term_wa) { if (flag == pos) /* A required term didn't appear anywhere. Print nothing */ goto hit_combination_done; else continue; } word_hits[flag][word_hits_count[flag]].term = strdup (query_string); word_hits[flag][word_hits_count[flag]].wa = term_wa; word_hits[flag][word_hits_count[flag]].flag = flag; word_hits_count[flag]++; assert (word_hits_count[flag] < MAX_QUERY_WORDS); bow_verbosify (bow_progress, "%8d %s\n", term_wa->length, query_string); } /* Bring together the WORD_HITS[*], following the correct +/- semantics */ current_di = 0; for (f = 0; f < num_flags; f++) for (h = 0; h < word_hits_count[f]; h++) current_wai[f][h] = 0; next_current_di: if (word_hits_count[pos] == 0) { /* Find a document in which a regular term appears, and align the CURRENT_WAI[REG][*] to point to the document if exists in that list */ min_di = INT_MAX; for (h = 0; h < word_hits_count[reg]; h++) { if (current_wai[reg][h] != -1 && (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi < current_di)) { if (current_wai[reg][h] < word_hits[reg][h].wa->length - 1) current_wai[reg][h]++; else current_wai[reg][h] = -1; } assert (current_wai[reg][h] == -1 || (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi >= current_di)); if (current_wai[reg][h] != -1 && word_hits[reg][h].wa->entry[current_wai[reg][h]].wi < min_di) min_di = word_hits[reg][h].wa->entry[current_wai[reg][h]].wi; } if (min_di == INT_MAX) goto hit_combination_done; current_di = min_di; } else { /* Find a document index in which all the +terms appear */ /* Loop until current_wai[pos][*] all point to the same document index */ do { something_was_greater_than_max = 0; for (h = 0; h < word_hits_count[pos]; h++) { while (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi < current_di) { if (current_wai[pos][h] < word_hits[pos][h].wa->length - 1) current_wai[pos][h]++; else /* We are at the end of a + list, and thus are done. */ goto hit_combination_done; } if (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi > current_di) { current_di = word_hits[pos][h].wa->entry[current_wai[pos][h]].wi; something_was_greater_than_max = 1; } } } while (something_was_greater_than_max); /* At this point all the CURRENT_WAI[pos][*] should be pointing to the same document. Verify this. */ for (h = 1; h < word_hits_count[pos]; h++) assert (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi == word_hits[pos][0].wa->entry[current_wai[pos][0]].wi); } /* Make sure the CURRENT_DI doesn't appear in any of the -term lists. */ for (h = 0; h < word_hits_count[neg]; h++) { /* Loop until we might have found the CURRENT_DI in this neg list */ while (current_wai[neg][h] != -1 && (word_hits[neg][h].wa->entry[current_wai[neg][h]].wi < current_di)) { if (current_wai[neg][h] < word_hits[neg][h].wa->length - 1) current_wai[neg][h]++; else current_wai[neg][h] = -1; } if (word_hits[neg][h].wa->entry[current_wai[neg][h]].wi == current_di) { current_di++; goto next_current_di; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -