📄 archer.c

📁 在Linux下处理英语文本分类
💻 C
📖 第 1 页 / 共 3 页
字号:
  int sequence_occurrence_count = 0;  int something_was_greater_than_max;  bow_wa *wa;  float scaler;  archer_doc *doc;  /* Parse the query */  lex = bow_default_lexer->open_str (bow_default_lexer, (char*)query_string);  if (lex == NULL)    return NULL;  query_len = 0;  while (bow_default_lexer->get_word (bow_default_lexer, lex,				      word, BOW_MAX_WORD_LENGTH))    {      /* Add the field-restricting suffix string, e.g. "xxxtitle" */      if (suffix_string[0])	{	  strcat (word, "xxx");	  strcat (word, suffix_string);	  assert (strlen (word) < BOW_MAX_WORD_LENGTH);	}      wi = bow_word2int_no_add (word);      if (wi >= 0)	{	  di[query_len] = pi[query_len] = -300;	  query[query_len++] = wi;	}      else if ((bow_lexer_stoplist_func		&& !(*bow_lexer_stoplist_func) (word))	       || (!bow_lexer_stoplist_func		   && strlen (word) < 2))	{	  /* If a query term wasn't present, and its not because it	     was in the stoplist or the word is a single char, then	     return no hits. */	  query_len = 0;	  break;	}      /* If we have no more room for more query terms, just don't use         the rest of them. */      if (query_len >= MAX_QUERY_WORDS)	break;    }  bow_default_lexer->close (bow_default_lexer, lex);  if (query_len == 0)    return NULL;  if (query_len == 1)    {      wa = archer_query_hits_matching_wi (query[0], 					  &sequence_occurrence_count);      goto search_done;    }  /* Initialize the array of document scores */  wa = bow_wa_new (0);  /* Search for documents containing the query words in the same order     as the query. */  bow_wi2pv_rewind (archer_wi2pv);  max_di = max_pi = -200;  /* Loop while we look for matches.  We'll break out of this loop when     any of the query words are at the end of their PV's. */  for (;;)    {      /* Keep reading DI and PI from one or more of the query-word PVs	 until none of the DIs or PIs is greater than the MAX_DI or	 MAX_PI.  At this point the DIs and PI's should all be equal,	 indicating a match.  Break out of this loop if all PVs are	 at the end, (at which piont they return -1 for both DI and	 PI). */      do	{	  something_was_greater_than_max = 0;	  for (i = 0; i < query_len; i++)	    {	      /* Keep looking for instances of word query[wi] */	      while (di[i] != -1		  && (di[i] < max_di		      || (di[i] <= max_di && pi[i] < max_pi)))		{		  bow_wi2pv_wi_next_di_pi (archer_wi2pv, query[i],					   &(di[i]), &(pi[i]));		  /* If any of the query words is at the end of their		     PV, then we're not going to find any more		     matches, and we're done setting the scores.  Go		     print the matching documents. */		  if (di[i] == -1)		    goto search_done;		  /* Make it so that all PI's will be equal if the words		     are in order. */		  pi[i] -= i;		  bow_verbosify (bow_verbose, "%20s %10d %10d %10d %10d\n", 				 bow_int2word (query[i]), 				 di[i], pi[i], max_di, max_pi);		}	      if (di[i] > max_di) 		{		  max_di = di[i];		  max_pi = pi[i];		  something_was_greater_than_max = 1;		}	      else if (pi[i] > max_pi && di[i] == max_di) 		{		  max_pi = pi[i];		  something_was_greater_than_max = 1;		}	    }	}      while (something_was_greater_than_max);      bow_verbosify (bow_verbose, 		     "something_was_greater_than_max di=%d\n", di[0]);      for (i = 1; i < query_len; i++)	assert (di[i] == di[0] && pi[i] == pi[0]);            /* Make sure this DI'th document hasn't been deleted.  If it         hasn't then add this DI to the WA---the list of hits */      doc = bow_sarray_entry_at_index (archer_docs, di[0]);      if (doc->word_count > 0)	{	  bow_wa_add_to_end (wa, di[0], 1);	  sequence_occurrence_count++;	}      /* Set up so that next time through we'll read the next words         from each PV. */      for (i = 0; i < query_len; i++)	{	  if (di[i] != -1)	    di[i] = -300;	  if (pi[i] != -1)	    pi[i] = -300;	}    } search_done:  if (wa->length == 0)    {      bow_wa_free (wa);      return NULL;    }  /* Scale the scores by the log of the occurrence count of this sequence,     and take the log of the count (shifted) to encourage documents that     have all query term to be ranked above documents that have many      repetitions of a few terms. */  if (!archer_arg_state.score_is_raw_count)    {      double document_frequency = wa->length;      scaler = 1.0 / log (5 + document_frequency);      for (i = 0; i < wa->length; i++)	wa->entry[i].weight = scaler * log (5 + wa->entry[i].weight);    }  return wa;}/* A temporary hack.  Also, does not work for queries containing   repeated words */voidarcher_query (){  int i;  int num_hits_to_print;#define NUM_FLAGS 3  enum {pos = 0,	reg,	neg,	num_flags};  struct _word_hit {    const char *term;    bow_wa *wa;    int flag;  } word_hits[num_flags][MAX_QUERY_WORDS];  int word_hits_count[num_flags];  int current_wai[num_flags][MAX_QUERY_WORDS];  struct _doc_hit {    int di;    float score;    const char **terms;    int terms_count;  } *doc_hits;  int doc_hits_count;  int doc_hits_size;  bow_wa *term_wa;  int current_di, h, f, min_di;  int something_was_greater_than_max;  char *query_copy, *query_remaining, *end;  char query_string[BOW_MAX_WORD_LENGTH];  char suffix_string[BOW_MAX_WORD_LENGTH];  int found_flag, flag, length;  /* For sorting the combined list of document hits */  int compare_doc_hits (struct _doc_hit *hit1, struct _doc_hit *hit2)    {      if (hit1->score < hit2->score)	return 1;      else if (hit1->score == hit2->score)	return 0;      else	return -1;    }  void archer_sort_hits (struct _doc_hit *hits, int hits_count, 			 int num_to_sort)    {      int i, j,max_j;      float max_score;      struct _doc_hit tmp;      /* Find the highest score NUM_TO_SORT times */      for (i = 0; i < num_to_sort;  i++)	{	  /* Find the next highest score */	  max_score = -FLT_MAX;	  max_j = -1;	  for (j = i; j < hits_count; j++)	    {	      if (hits[j].score > max_score)		{		  max_score = hits[j].score;		  max_j = j;		}	    }	  /* Move the high score into position */	  assert (max_j >= 0);	  tmp = hits[i];	  hits[i] = hits[max_j];	  hits[max_j] = tmp;	}    }	  /* Initialize the list of target documents associated with each term */  for (i = 0; i < num_flags; i++)    word_hits_count[i] = 0;  /* Initialize the combined list of target documents */  doc_hits_size = 1000;  doc_hits_count = 0;  doc_hits = bow_malloc (doc_hits_size * sizeof (struct _doc_hit));  /* Process each term in the query.  Quoted sections count as one     term here. */  query_remaining = query_copy = strdup (archer_arg_state.query_string);  assert (query_copy);  /* Chop any trailing newline or carriage return. */  end = strpbrk (query_remaining, "\n\r");  if (end)    *end = '\0';  while (*query_remaining)    {      /* Find the beginning of the next query term, and record +/- flags */      while (*query_remaining 	     && (!isalnum ((unsigned char)*query_remaining)		 && *query_remaining != ':'		 && *query_remaining != '+'		 && *query_remaining != '-'		 && *query_remaining != '"'))	query_remaining++;      flag = reg;      found_flag = 0;      if (*query_remaining == '\0')	{	  break;	}      if (*query_remaining == '+')	{	  query_remaining++;	  flag = pos;	}      else if (*query_remaining == '-')	{	  query_remaining++;	  flag = neg;	}      /* See if there is a field-restricting tag here, and if so, deal         with it */      if ((end = strpbrk (query_remaining, ": \"\t"))	  && *end == ':')	{	  /* The above condition ensures that a ':' appears before any	     term-delimiters */	  /* Find the end of the field-restricting suffix */	  length = end - query_remaining;	  assert (length < BOW_MAX_WORD_LENGTH);	  /* Remember the suffix, and move ahead the QUERY_REMAINING */	  memcpy (suffix_string, query_remaining, length);	  suffix_string[length] = '\0';	  query_remaining = end + 1;	}      else	suffix_string[0] = '\0';      /* Find the end of the next query term. */      if (*query_remaining == '"')	{	  query_remaining++;	  end = strchr (query_remaining, '"');	}      else	{	  end = strchr (query_remaining, ' ');	}      if (end == NULL)	end = strchr (query_remaining, '\0');      /* Put the next query term into QUERY_STRING and increment         QUERY_REMAINING */      length = end - query_remaining;      length = MIN (length, BOW_MAX_WORD_LENGTH-1);      memcpy (query_string, query_remaining, length);      query_string[length] = '\0';      if (*end == '"')	query_remaining = end + 1;      else	query_remaining = end;      if (length == 0)	continue;      /* printf ("%d %s\n", flag, query_string); */      /* Get the list of documents matching the term */      term_wa = archer_query_hits_matching_sequence (query_string, 						     suffix_string);      if (!term_wa)	{	  if (flag == pos)	    /* A required term didn't appear anywhere.  Print nothing */	    goto hit_combination_done;	  else	    continue;	}      word_hits[flag][word_hits_count[flag]].term = strdup (query_string);      word_hits[flag][word_hits_count[flag]].wa = term_wa;      word_hits[flag][word_hits_count[flag]].flag = flag;      word_hits_count[flag]++;      assert (word_hits_count[flag] < MAX_QUERY_WORDS);      bow_verbosify (bow_progress, "%8d %s\n", term_wa->length, query_string);    }  /* Bring together the WORD_HITS[*], following the correct +/-     semantics */  current_di = 0;  for (f = 0; f < num_flags; f++)    for (h = 0; h < word_hits_count[f]; h++)      current_wai[f][h] = 0; next_current_di:  if (word_hits_count[pos] == 0)    {      /* Find a document in which a regular term appears, and align the	 CURRENT_WAI[REG][*] to point to the document if exists in that list */      min_di = INT_MAX;      for (h = 0; h < word_hits_count[reg]; h++)	{	  if (current_wai[reg][h] != -1	      && (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi		  < current_di))	    {	      if (current_wai[reg][h] < word_hits[reg][h].wa->length - 1)		current_wai[reg][h]++;	      else		current_wai[reg][h] = -1;	    }	  assert (current_wai[reg][h] == -1		  || (word_hits[reg][h].wa->entry[current_wai[reg][h]].wi		      >= current_di));	  if (current_wai[reg][h] != -1	      && word_hits[reg][h].wa->entry[current_wai[reg][h]].wi < min_di)	    min_di = word_hits[reg][h].wa->entry[current_wai[reg][h]].wi;	}      if (min_di == INT_MAX)	goto hit_combination_done;	      current_di = min_di;    }  else    {      /* Find a document index in which all the +terms appear */      /* Loop until current_wai[pos][*] all point to the same document index */      do	{	  something_was_greater_than_max = 0;	  for (h = 0; h < word_hits_count[pos]; h++)	    {	      while (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi		     < current_di)		{		  if (current_wai[pos][h] < word_hits[pos][h].wa->length - 1)		    current_wai[pos][h]++;		  else		    /* We are at the end of a + list, and thus are done. */		    goto hit_combination_done;		}	      if (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi 		  > current_di)		{		  current_di = 		    word_hits[pos][h].wa->entry[current_wai[pos][h]].wi;		  something_was_greater_than_max = 1;		}	    }	}      while (something_was_greater_than_max);      /* At this point all the CURRENT_WAI[pos][*] should be pointing to the	 same document.  Verify this. */      for (h = 1; h < word_hits_count[pos]; h++)	assert (word_hits[pos][h].wa->entry[current_wai[pos][h]].wi		== word_hits[pos][0].wa->entry[current_wai[pos][0]].wi);    }  /* Make sure the CURRENT_DI doesn't appear in any of the -term lists. */  for (h = 0; h < word_hits_count[neg]; h++)    {      /* Loop until we might have found the CURRENT_DI in this neg list */      while (current_wai[neg][h] != -1	     && (word_hits[neg][h].wa->entry[current_wai[neg][h]].wi		 < current_di))	{	  if (current_wai[neg][h] < word_hits[neg][h].wa->length - 1)	    current_wai[neg][h]++;	  else	    current_wai[neg][h] = -1;	}      if (word_hits[neg][h].wa->entry[current_wai[neg][h]].wi == current_di)	{	  current_di++;	  goto next_current_di;	}    }
💿 文件大小 11 K
👤 上传用户 c_word
📂 所属分类 Linux/Unix编程
📄 代码行数 1,299 行
💻 语言类型 C语言
🏷️ 相关标签

#Linux #英语 #文本分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -