gunichartables.cpp

来自「clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编」· C++ 代码 · 共 380 行
CPP
380 行
/*
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 *
 ************************************************
 * Also licensed with permission from Tom Tromey 
 * and Owen Taylor under the Apache license.
 * Original location:
 * http://cvs.gnome.org/viewcvs/glib/glib/guniprop.c?view=log
 ************************************************
 * 
 * Copyright 2003-2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "CLucene/StdHeader.h"

typedef unsigned long  gunichar;
typedef unsigned short guint16;
typedef          short gint16;
typedef          char  gchar;
typedef unsigned char  guchar;

/* These are the possible character classifications.
 * See http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
   or http://www.unicode.org/Public/UNIDATA/UCD.html.
      
   todo: i think there is a new version of the unicode, which we should use.
   data is licensed like this: http://www.unicode.org/copyright.html... not sure but looks apache compatible
 */
typedef enum
{
  G_UNICODE_CONTROL,
  G_UNICODE_FORMAT,
  G_UNICODE_UNASSIGNED,
  G_UNICODE_PRIVATE_USE,
  G_UNICODE_SURROGATE,
  G_UNICODE_LOWERCASE_LETTER,
  G_UNICODE_MODIFIER_LETTER,
  G_UNICODE_OTHER_LETTER,
  G_UNICODE_TITLECASE_LETTER,
  G_UNICODE_UPPERCASE_LETTER,
  G_UNICODE_COMBINING_MARK,
  G_UNICODE_ENCLOSING_MARK,
  G_UNICODE_NON_SPACING_MARK,
  G_UNICODE_DECIMAL_NUMBER,
  G_UNICODE_LETTER_NUMBER,
  G_UNICODE_OTHER_NUMBER,
  G_UNICODE_CONNECT_PUNCTUATION,
  G_UNICODE_DASH_PUNCTUATION,
  G_UNICODE_CLOSE_PUNCTUATION,
  G_UNICODE_FINAL_PUNCTUATION,
  G_UNICODE_INITIAL_PUNCTUATION,
  G_UNICODE_OTHER_PUNCTUATION,
  G_UNICODE_OPEN_PUNCTUATION,
  G_UNICODE_CURRENCY_SYMBOL,
  G_UNICODE_MODIFIER_SYMBOL,
  G_UNICODE_MATH_SYMBOL,
  G_UNICODE_OTHER_SYMBOL,
  G_UNICODE_LINE_SEPARATOR,
  G_UNICODE_PARAGRAPH_SEPARATOR,
  G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;


#include "gunichartables.h"

#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])

#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))


#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))

/* Count the number of elements in an array. The array must be defined
 * as such; using this with a dynamically allocated array will give
 * incorrect results.
 */
#define G_N_ELEMENTS(arr)		(sizeof (arr) / sizeof ((arr)[0]))




#if defined(LUCENE_USE_INTERNAL_CHAR_FUNCTIONS)
#ifdef _LUCENE_PRAGMA_WARNINGS
 #pragma message ("===== Using internal character function =====")
#else
 #warning "===== Using internal character function ====="
#endif

bool cl_isletter(gunichar c)
{
    int t = TYPE (c);
    switch(t)
    {
      case G_UNICODE_LOWERCASE_LETTER: return true;
      case G_UNICODE_TITLECASE_LETTER: return true;
      case G_UNICODE_UPPERCASE_LETTER: return true;
      case G_UNICODE_MODIFIER_LETTER: return true;
      case G_UNICODE_OTHER_LETTER: return true;
      default: return false;
    }
}

bool cl_isalnum(gunichar c)
{
    int t = TYPE (c);
    switch(t)
    {
      case G_UNICODE_LOWERCASE_LETTER: return true;
      case G_UNICODE_TITLECASE_LETTER: return true;
      case G_UNICODE_UPPERCASE_LETTER: return true;
      case G_UNICODE_MODIFIER_LETTER: return true;
      case G_UNICODE_OTHER_LETTER: return true;
      case G_UNICODE_DECIMAL_NUMBER: return true;
      case G_UNICODE_LETTER_NUMBER: return true;
      case G_UNICODE_OTHER_NUMBER: return true;
      default: return false;
    }
}

bool cl_isdigit(gunichar c)
{
    int t = TYPE (c);
    switch(t)
    {
      case G_UNICODE_DECIMAL_NUMBER: return true;
      case G_UNICODE_LETTER_NUMBER: return true;
      case G_UNICODE_OTHER_NUMBER: return true;
      default: return false;
    }
}

/**
 * cl_isspace:
 * @c: a Unicode character
 *
 * Determines whether a character is a space, tab, or line separator
 * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 * character value with lucene_utf8towc().
 *
 * (Note: don't use this to do word breaking; you have to use
 * Pango or equivalent to get word breaking right, the algorithm
 * is fairly complex.)
 *
 * Return value: %TRUE if @c is a punctuation character
 **/
bool cl_isspace (gunichar c)
{
  switch (c)
  {
      /* special-case these since Unicode thinks they are not spaces */
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;

    default:
    {
     int t = TYPE ((gunichar)c);
     return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
             || t == G_UNICODE_PARAGRAPH_SEPARATOR);
    }
  }
}



/**
 * cl_tolower:
 * @c: a Unicode character.
 *
 * Converts a character to lower case.
 *
 * Return value: the result of converting @c to lower case.
 *               If @c is not an upperlower or titlecase character,
 *               or has no lowercase equivalent @c is returned unchanged.
 **/
TCHAR cl_tolower (TCHAR ch)
{
  gunichar c=ch;
  int t = TYPE ((gunichar)c);
  if (t == G_UNICODE_UPPERCASE_LETTER)
  {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
      if (val >= 0x1000000)
      {
        const gchar *p = special_case_table + val - 0x1000000;
        int len=0;
		wchar_t ret=0;
		lucene_utf8towc(&ret,p,6);
#ifdef _UCS2
		return ret;
#else
        return LUCENE_OOR_CHAR(ret);
#endif
        //return cl_utf8_get_char (p, &len);
      }else
        return val ? val : c;
  }else if (t == G_UNICODE_TITLECASE_LETTER){
      unsigned int i;
      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
      {
        if (title_table[i][0] == c)
          return title_table[i][2];
      }
  }
  return c;
}

/**
 * cl_toupper:
 * @c: a Unicode character
 * 
 * Converts a character to uppercase.
 * 
 * Return value: the result of converting @c to uppercase.
 *               If @c is not an lowercase or titlecase character,
 *               or has no upper case equivalent @c is returned unchanged.
 **/
TCHAR cl_toupper (TCHAR ch)
{
  gunichar c=ch;
  int t = TYPE (c);
  if (t == G_UNICODE_LOWERCASE_LETTER)
    {
      gunichar val = ATTTABLE (c >> 8, c & 0xff);
      if (val >= 0x1000000)
	{
	  const gchar *p = special_case_table + val - 0x1000000;
	  
	  wchar_t ret=0;
	  lucene_utf8towc(&ret,p,6);
#ifdef _UCS2
	  return ret;
#else
      return LUCENE_OOR_CHAR(ret);
#endif
	  //return lucene_utf8towc (p);
	}
      else
	return val ? val : c;
    }
  else if (t == G_UNICODE_TITLECASE_LETTER)
    {
      unsigned int i;
      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
	{
	  if (title_table[i][0] == c)
	    return title_table[i][1];
	}
    }
  return c;
}



/**
 * cl_tcasefold:
 * @str: a unicode string
 *
 * Converts a string into a form that is independent of case. The
 * result will not correspond to any particular case, but can be
 * compared for equality or ordered with the results of calling
 * cl_tcasefold() on other strings.
 *
 * Note that calling cl_tcasefold() followed by g_utf8_collate() is
 * only an approximation to the correct linguistic case insensitive
 * ordering, though it is a fairly good one. Getting this exactly
 * right would require a more sophisticated collation function that
 * takes case sensitivity into account. GLib does not currently
 * provide such a function.
 *
 * Return value: a newly allocated string, that is a
 *   case independent form of @str.
 **/
TCHAR cl_tcasefold(const TCHAR ch){
    int start = 0;
    int end = G_N_ELEMENTS (casefold_table);
    
	if (ch >= casefold_table[start].ch &&
        ch <= casefold_table[end - 1].ch)
    {
        while (1)
        {
            int half = (start + end) / 2;
            if (ch == casefold_table[half].ch)
            {
				   wchar_t ret=0;
				   lucene_utf8towc(&ret,casefold_table[half].data,6);

               #ifdef _UCS2
		           return ret;
               #else
                   LUCENE_OOR_CHAR(ret)
               #endif
            }else if (half == start){
                break;
            }else if (ch > casefold_table[half].ch){
                start = half;
            }else{
                end = half;
            }
        }
    }
    return cl_tolower(ch);
    
}


//this function was not taken from gnome
TCHAR* cl_tcscasefold( TCHAR * str, int len ) //len default is -1
{
    TCHAR *p = str;
    while ((len < 0 || p < str + len) && *p)
    {
        *p = cl_tcasefold(*p);
		p++;
    }
    return str;
}
//this function was not taken from gnome
int cl_tcscasefoldcmp(const TCHAR * dst, const TCHAR * src){
    TCHAR f,l;
    
    do{
        f = cl_tcasefold( (*(dst++)) );
        l = cl_tcasefold( (*(src++)) );
    } while ( (f) && (f == l) );
    
    return (int)(f - l);
}

#endif
gunichartables.cpp - 源码说明

本页面展示了「clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.」中的 gunichartables.cpp 源码文件，采用 C++ 编程语言编写，共 380 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与CLucene相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?