Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

utf8_tables.h File Reference


Detailed Description

Unicode Lookup Tables.

Author:
Christian Biere
Date:
2004-2005

Go to the source code of this file.

Data Structures

struct  utf32_nfkd
 The upper 8 bits of c are reserved for flags. More...

struct  utf32_comb_class
struct  utf32_general_category

Defines

#define UTF32_NFKD_REPLACE_MAXLEN   18
#define UTF32_F_MASK   0xff000000U
#define UTF32_F_NFKD   0x80000000U /**< Set for compatibility compositions */
 Set for compatibility compositions.

#define UTF32_NFD(x)   (x)
#define UTF32_NFKD(x)   (UTF32_F_NFKD | (x))
#define UNI_GC_LU   UNI_GC_LETTER_UPPERCASE
#define UNI_GC_LL   UNI_GC_LETTER_LOWERCASE
#define UNI_GC_LT   UNI_GC_LETTER_TITLECASE
#define UNI_GC_LM   UNI_GC_LETTER_MODIFIER
#define UNI_GC_LO   UNI_GC_LETTER_OTHER
#define UNI_GC_MN   UNI_GC_MARK_NONSPACING
#define UNI_GC_MC   UNI_GC_MARK_SPACING_COMBINE
#define UNI_GC_ME   UNI_GC_MARK_ENCLOSING
#define UNI_GC_ND   UNI_GC_NUMBER_DECIMAL
#define UNI_GC_NL   UNI_GC_NUMBER_LETTER
#define UNI_GC_NO   UNI_GC_NUMBER_OTHER
#define UNI_GC_PC   UNI_GC_PUNCT_CONNECTOR
#define UNI_GC_PD   UNI_GC_PUNCT_DASH
#define UNI_GC_PS   UNI_GC_PUNCT_OPEN
#define UNI_GC_PE   UNI_GC_PUNCT_CLOSE
#define UNI_GC_PI   UNI_GC_PUNCT_INIT_QUOTE
#define UNI_GC_PF   UNI_GC_PUNCT_FINAL_QUOTE
#define UNI_GC_PO   UNI_GC_PUNCT_OTHER
#define UNI_GC_SM   UNI_GC_SYMBOL_MATH
#define UNI_GC_SC   UNI_GC_SYMBOL_CURRENCY
#define UNI_GC_SK   UNI_GC_SYMBOL_MODIFIER
#define UNI_GC_SO   UNI_GC_SYMBOL_OTHER
#define UNI_GC_ZS   UNI_GC_SEPARATOR_SPACE
#define UNI_GC_ZL   UNI_GC_SEPARATOR_LINE
#define UNI_GC_ZP   UNI_GC_SEPARATOR_PARAGRAPH
#define UNI_GC_CC   UNI_GC_OTHER_CONTROL
#define UNI_GC_CF   UNI_GC_OTHER_FORMAT
#define UNI_GC_CS   UNI_GC_OTHER_SURROGATE
#define UNI_GC_CO   UNI_GC_OTHER_PRIVATE_USE
#define UNI_GC_CN   UNI_GC_OTHER_NOT_ASSIGNED

Enumerations

enum  uni_gc_t {
  UNI_GC_LETTER_UPPERCASE = 0, UNI_GC_LETTER_LOWERCASE, UNI_GC_LETTER_TITLECASE, UNI_GC_LETTER_MODIFIER,
  UNI_GC_LETTER_OTHER, UNI_GC_MARK_NONSPACING, UNI_GC_MARK_SPACING_COMBINE, UNI_GC_MARK_ENCLOSING,
  UNI_GC_NUMBER_DECIMAL, UNI_GC_NUMBER_LETTER, UNI_GC_NUMBER_OTHER, UNI_GC_PUNCT_CONNECTOR,
  UNI_GC_PUNCT_DASH, UNI_GC_PUNCT_OPEN, UNI_GC_PUNCT_CLOSE, UNI_GC_PUNCT_INIT_QUOTE,
  UNI_GC_PUNCT_FINAL_QUOTE, UNI_GC_PUNCT_OTHER, UNI_GC_SYMBOL_MATH, UNI_GC_SYMBOL_CURRENCY,
  UNI_GC_SYMBOL_MODIFIER, UNI_GC_SYMBOL_OTHER, UNI_GC_SEPARATOR_SPACE, UNI_GC_SEPARATOR_LINE,
  UNI_GC_SEPARATOR_PARAGRAPH, UNI_GC_OTHER_CONTROL, UNI_GC_OTHER_FORMAT, UNI_GC_OTHER_SURROGATE,
  UNI_GC_OTHER_PRIVATE_USE, UNI_GC_OTHER_NOT_ASSIGNED
}

Variables

struct {
   const guchar   start
   const guchar   end
utf8_2nd_byte_tab [64]
const struct utf32_nfkd utf32_nfkd_lut []
 The upper 8 bits of c are reserved for flags.

struct {
   guint16   lower
   guint16   upper
utf32_uppercase_lut []
struct {
   guint16   upper
   guint16   lower
utf32_lowercase_lut []
const struct utf32_comb_class utf32_comb_class_lut []
const guint32 utf32_composition_exclusions []
 These are special exclusions which cannot be derived from UnicodeData.txt but are listed in CompositionExclusions.txt.

const struct utf32_general_category utf32_general_category_lut []
struct {
   guint32   start
   guint32   end
utf32_block_id_lut []
struct {
   guint32   uc
   guint16   len
utf32_normalization_specials []
 This table is huge! It can be generated with the following AWK script:

struct {
   guint16   uc
   const char   s [4]
jap_tab []


Define Documentation

#define UNI_GC_CC   UNI_GC_OTHER_CONTROL
 

#define UNI_GC_CF   UNI_GC_OTHER_FORMAT
 

#define UNI_GC_CN   UNI_GC_OTHER_NOT_ASSIGNED
 

#define UNI_GC_CO   UNI_GC_OTHER_PRIVATE_USE
 

#define UNI_GC_CS   UNI_GC_OTHER_SURROGATE
 

#define UNI_GC_LL   UNI_GC_LETTER_LOWERCASE
 

#define UNI_GC_LM   UNI_GC_LETTER_MODIFIER
 

#define UNI_GC_LO   UNI_GC_LETTER_OTHER
 

#define UNI_GC_LT   UNI_GC_LETTER_TITLECASE
 

#define UNI_GC_LU   UNI_GC_LETTER_UPPERCASE
 

#define UNI_GC_MC   UNI_GC_MARK_SPACING_COMBINE
 

#define UNI_GC_ME   UNI_GC_MARK_ENCLOSING
 

#define UNI_GC_MN   UNI_GC_MARK_NONSPACING
 

#define UNI_GC_ND   UNI_GC_NUMBER_DECIMAL
 

#define UNI_GC_NL   UNI_GC_NUMBER_LETTER
 

#define UNI_GC_NO   UNI_GC_NUMBER_OTHER
 

#define UNI_GC_PC   UNI_GC_PUNCT_CONNECTOR
 

#define UNI_GC_PD   UNI_GC_PUNCT_DASH
 

#define UNI_GC_PE   UNI_GC_PUNCT_CLOSE
 

#define UNI_GC_PF   UNI_GC_PUNCT_FINAL_QUOTE
 

#define UNI_GC_PI   UNI_GC_PUNCT_INIT_QUOTE
 

#define UNI_GC_PO   UNI_GC_PUNCT_OTHER
 

#define UNI_GC_PS   UNI_GC_PUNCT_OPEN
 

#define UNI_GC_SC   UNI_GC_SYMBOL_CURRENCY
 

#define UNI_GC_SK   UNI_GC_SYMBOL_MODIFIER
 

#define UNI_GC_SM   UNI_GC_SYMBOL_MATH
 

#define UNI_GC_SO   UNI_GC_SYMBOL_OTHER
 

#define UNI_GC_ZL   UNI_GC_SEPARATOR_LINE
 

#define UNI_GC_ZP   UNI_GC_SEPARATOR_PARAGRAPH
 

#define UNI_GC_ZS   UNI_GC_SEPARATOR_SPACE
 

#define UTF32_F_MASK   0xff000000U
 

#define UTF32_F_NFKD   0x80000000U /**< Set for compatibility compositions */
 

Set for compatibility compositions.

#define UTF32_NFD(x)   (x)
 

#define UTF32_NFKD(x)   (UTF32_F_NFKD | (x))
 

#define UTF32_NFKD_REPLACE_MAXLEN   18
 


Enumeration Type Documentation

enum uni_gc_t
 

Enumeration values:
UNI_GC_LETTER_UPPERCASE 
UNI_GC_LETTER_LOWERCASE 
UNI_GC_LETTER_TITLECASE 
UNI_GC_LETTER_MODIFIER 
UNI_GC_LETTER_OTHER 
UNI_GC_MARK_NONSPACING 
UNI_GC_MARK_SPACING_COMBINE 
UNI_GC_MARK_ENCLOSING 
UNI_GC_NUMBER_DECIMAL 
UNI_GC_NUMBER_LETTER 
UNI_GC_NUMBER_OTHER 
UNI_GC_PUNCT_CONNECTOR 
UNI_GC_PUNCT_DASH 
UNI_GC_PUNCT_OPEN 
UNI_GC_PUNCT_CLOSE 
UNI_GC_PUNCT_INIT_QUOTE 
UNI_GC_PUNCT_FINAL_QUOTE 
UNI_GC_PUNCT_OTHER 
UNI_GC_SYMBOL_MATH 
UNI_GC_SYMBOL_CURRENCY 
UNI_GC_SYMBOL_MODIFIER 
UNI_GC_SYMBOL_OTHER 
UNI_GC_SEPARATOR_SPACE 
UNI_GC_SEPARATOR_LINE 
UNI_GC_SEPARATOR_PARAGRAPH 
UNI_GC_OTHER_CONTROL 
UNI_GC_OTHER_FORMAT 
UNI_GC_OTHER_SURROGATE 
UNI_GC_OTHER_PRIVATE_USE 
UNI_GC_OTHER_NOT_ASSIGNED 


Variable Documentation

guint32 end
 

const { ... } jap_tab[] [static]
 

guint16 len
 

The array length.

guint16 lower
 

const char s[4]
 

guint32 start
 

guint16 uc
 

The first unicode character in the array.

guint16 upper
 

const { ... } utf32_block_id_lut[] [static]
 

const struct utf32_comb_class utf32_comb_class_lut[] [static]
 

const guint32 utf32_composition_exclusions[] [static]
 

These are special exclusions which cannot be derived from UnicodeData.txt but are listed in CompositionExclusions.txt.

Note that the entries in that file are not completely sorted.

const struct utf32_general_category utf32_general_category_lut[] [static]
 

const { ... } utf32_lowercase_lut[] [static]
 

const struct utf32_nfkd utf32_nfkd_lut[] [static]
 

The upper 8 bits of c are reserved for flags.

The character value is, thus, only the lower 24 bits.

const { ... } utf32_normalization_specials[] [static]
 

This table is huge! It can be generated with the following AWK script:

awk 'BEGIN { FS=";" } /^[0-9A-Z]/ { sub("#.*", ""); printf("{ { "); for (i = 1; i < 6; i++) { gsub(" ", ", 0x", $i); sub("$", ",", $i); printf("{ 0x%s }%s", $i, i < 5 ? ", " : " } },\n" ); } } ' NormalizationTest.txt

const { ... } utf32_uppercase_lut[] [static]
 

const { ... } utf8_2nd_byte_tab[64] [static]
 


Generated on Sun Feb 12 10:50:10 2006 for Gtk-Gnutella by doxygen 1.3.6