utf8_tables.h File Reference

Detailed Description

Unicode Lookup Tables.

Author: Christian Biere

Date: 2004-2005

Go to the source code of this file.

Data Structures

struct utf32_nfkd

The upper 8 bits of c are reserved for flags. More...

struct utf32_comb_class

struct utf32_general_category

Defines

#define UTF32_NFKD_REPLACE_MAXLEN   18

#define UTF32_F_MASK   0xff000000U

#define UTF32_F_NFKD   0x80000000U /**< Set for compatibility compositions */

Set for compatibility compositions.

#define UTF32_NFD(x)   (x)

#define UTF32_NFKD(x)   (UTF32_F_NFKD | (x))

#define UNI_GC_LU   UNI_GC_LETTER_UPPERCASE

#define UNI_GC_LL   UNI_GC_LETTER_LOWERCASE

#define UNI_GC_LT   UNI_GC_LETTER_TITLECASE

#define UNI_GC_LM   UNI_GC_LETTER_MODIFIER

#define UNI_GC_LO   UNI_GC_LETTER_OTHER

#define UNI_GC_MN   UNI_GC_MARK_NONSPACING

#define UNI_GC_MC   UNI_GC_MARK_SPACING_COMBINE

#define UNI_GC_ME   UNI_GC_MARK_ENCLOSING

#define UNI_GC_ND   UNI_GC_NUMBER_DECIMAL

#define UNI_GC_NL   UNI_GC_NUMBER_LETTER

#define UNI_GC_NO   UNI_GC_NUMBER_OTHER

#define UNI_GC_PC   UNI_GC_PUNCT_CONNECTOR

#define UNI_GC_PD   UNI_GC_PUNCT_DASH

#define UNI_GC_PS   UNI_GC_PUNCT_OPEN

#define UNI_GC_PE   UNI_GC_PUNCT_CLOSE

#define UNI_GC_PI   UNI_GC_PUNCT_INIT_QUOTE

#define UNI_GC_PF   UNI_GC_PUNCT_FINAL_QUOTE

#define UNI_GC_PO   UNI_GC_PUNCT_OTHER

#define UNI_GC_SM   UNI_GC_SYMBOL_MATH

#define UNI_GC_SC   UNI_GC_SYMBOL_CURRENCY

#define UNI_GC_SK   UNI_GC_SYMBOL_MODIFIER

#define UNI_GC_SO   UNI_GC_SYMBOL_OTHER

#define UNI_GC_ZS   UNI_GC_SEPARATOR_SPACE

#define UNI_GC_ZL   UNI_GC_SEPARATOR_LINE

#define UNI_GC_ZP   UNI_GC_SEPARATOR_PARAGRAPH

#define UNI_GC_CC   UNI_GC_OTHER_CONTROL

#define UNI_GC_CF   UNI_GC_OTHER_FORMAT

#define UNI_GC_CS   UNI_GC_OTHER_SURROGATE

#define UNI_GC_CO   UNI_GC_OTHER_PRIVATE_USE

#define UNI_GC_CN   UNI_GC_OTHER_NOT_ASSIGNED

Enumerations

enum uni_gc_t {
  UNI_GC_LETTER_UPPERCASE = 0, UNI_GC_LETTER_LOWERCASE, UNI_GC_LETTER_TITLECASE, UNI_GC_LETTER_MODIFIER,
  UNI_GC_LETTER_OTHER, UNI_GC_MARK_NONSPACING, UNI_GC_MARK_SPACING_COMBINE, UNI_GC_MARK_ENCLOSING,
  UNI_GC_NUMBER_DECIMAL, UNI_GC_NUMBER_LETTER, UNI_GC_NUMBER_OTHER, UNI_GC_PUNCT_CONNECTOR,
  UNI_GC_PUNCT_DASH, UNI_GC_PUNCT_OPEN, UNI_GC_PUNCT_CLOSE, UNI_GC_PUNCT_INIT_QUOTE,
  UNI_GC_PUNCT_FINAL_QUOTE, UNI_GC_PUNCT_OTHER, UNI_GC_SYMBOL_MATH, UNI_GC_SYMBOL_CURRENCY,
  UNI_GC_SYMBOL_MODIFIER, UNI_GC_SYMBOL_OTHER, UNI_GC_SEPARATOR_SPACE, UNI_GC_SEPARATOR_LINE,
  UNI_GC_SEPARATOR_PARAGRAPH, UNI_GC_OTHER_CONTROL, UNI_GC_OTHER_FORMAT, UNI_GC_OTHER_SURROGATE,
  UNI_GC_OTHER_PRIVATE_USE, UNI_GC_OTHER_NOT_ASSIGNED
}

Variables

struct {

   const guchar   start

   const guchar   end

} utf8_2nd_byte_tab [64]

const struct utf32_nfkd utf32_nfkd_lut []

The upper 8 bits of c are reserved for flags.

struct {

   guint16   lower

   guint16   upper

} utf32_uppercase_lut []

struct {

   guint16   upper

   guint16   lower

} utf32_lowercase_lut []

const struct utf32_comb_class utf32_comb_class_lut []

const guint32 utf32_composition_exclusions []

These are special exclusions which cannot be derived from UnicodeData.txt but are listed in CompositionExclusions.txt.

const struct utf32_general_category utf32_general_category_lut []

struct {

   guint32   start

   guint32   end

} utf32_block_id_lut []

struct {

   guint32   uc

   guint16   len

} utf32_normalization_specials []

This table is huge! It can be generated with the following AWK script:

struct {

   guint16   uc

   const char   s [4]

} jap_tab []

Define Documentation

#define UNI_GC_CC UNI_GC_OTHER_CONTROL

#define UNI_GC_CF UNI_GC_OTHER_FORMAT

#define UNI_GC_CN UNI_GC_OTHER_NOT_ASSIGNED

#define UNI_GC_CO UNI_GC_OTHER_PRIVATE_USE

#define UNI_GC_CS UNI_GC_OTHER_SURROGATE

#define UNI_GC_LL UNI_GC_LETTER_LOWERCASE

#define UNI_GC_LM UNI_GC_LETTER_MODIFIER

#define UNI_GC_LO UNI_GC_LETTER_OTHER

#define UNI_GC_LT UNI_GC_LETTER_TITLECASE

#define UNI_GC_LU UNI_GC_LETTER_UPPERCASE

#define UNI_GC_MC UNI_GC_MARK_SPACING_COMBINE

#define UNI_GC_ME UNI_GC_MARK_ENCLOSING

#define UNI_GC_MN UNI_GC_MARK_NONSPACING

#define UNI_GC_ND UNI_GC_NUMBER_DECIMAL

#define UNI_GC_NL UNI_GC_NUMBER_LETTER

#define UNI_GC_NO UNI_GC_NUMBER_OTHER

#define UNI_GC_PC UNI_GC_PUNCT_CONNECTOR

#define UNI_GC_PD UNI_GC_PUNCT_DASH

#define UNI_GC_PE UNI_GC_PUNCT_CLOSE

#define UNI_GC_PF UNI_GC_PUNCT_FINAL_QUOTE

#define UNI_GC_PI UNI_GC_PUNCT_INIT_QUOTE

#define UNI_GC_PO UNI_GC_PUNCT_OTHER

#define UNI_GC_PS UNI_GC_PUNCT_OPEN

#define UNI_GC_SC UNI_GC_SYMBOL_CURRENCY

#define UNI_GC_SK UNI_GC_SYMBOL_MODIFIER

#define UNI_GC_SM UNI_GC_SYMBOL_MATH

#define UNI_GC_SO UNI_GC_SYMBOL_OTHER

#define UNI_GC_ZL UNI_GC_SEPARATOR_LINE

#define UNI_GC_ZP UNI_GC_SEPARATOR_PARAGRAPH

#define UNI_GC_ZS UNI_GC_SEPARATOR_SPACE

#define UTF32_F_MASK 0xff000000U

#define UTF32_F_NFKD 0x80000000U /**< Set for compatibility compositions */

Set for compatibility compositions.

#define UTF32_NFD(x) (x)

#define UTF32_NFKD(x) (UTF32_F_NFKD | (x))

#define UTF32_NFKD_REPLACE_MAXLEN 18

Enumeration Type Documentation

enum uni_gc_t

Enumeration values:

UNI_GC_LETTER_UPPERCASE

UNI_GC_LETTER_LOWERCASE

UNI_GC_LETTER_TITLECASE

UNI_GC_LETTER_MODIFIER

UNI_GC_LETTER_OTHER

UNI_GC_MARK_NONSPACING

UNI_GC_MARK_SPACING_COMBINE

UNI_GC_MARK_ENCLOSING

UNI_GC_NUMBER_DECIMAL

UNI_GC_NUMBER_LETTER

UNI_GC_NUMBER_OTHER

UNI_GC_PUNCT_CONNECTOR

UNI_GC_PUNCT_DASH

UNI_GC_PUNCT_OPEN

UNI_GC_PUNCT_CLOSE

UNI_GC_PUNCT_INIT_QUOTE

UNI_GC_PUNCT_FINAL_QUOTE

UNI_GC_PUNCT_OTHER

UNI_GC_SYMBOL_MATH

UNI_GC_SYMBOL_CURRENCY

UNI_GC_SYMBOL_MODIFIER

UNI_GC_SYMBOL_OTHER

UNI_GC_SEPARATOR_SPACE

UNI_GC_SEPARATOR_LINE

UNI_GC_SEPARATOR_PARAGRAPH

UNI_GC_OTHER_CONTROL

UNI_GC_OTHER_FORMAT

UNI_GC_OTHER_SURROGATE

UNI_GC_OTHER_PRIVATE_USE

UNI_GC_OTHER_NOT_ASSIGNED

Variable Documentation

guint32 end

const { ... } jap_tab[] [static]

guint16 len

The array length.

guint16 lower

const char s[4]

guint32 start

guint16 uc

The first Unicode character in the array.

guint16 upper

const { ... } utf32_block_id_lut[] [static]

const struct utf32_comb_class utf32_comb_class_lut[] [static]

const guint32 utf32_composition_exclusions[] [static]

These are special exclusions which cannot be derived from UnicodeData.txt but are listed in CompositionExclusions.txt.
Note that the entries in that file are not completely sorted.

const struct utf32_general_category utf32_general_category_lut[] [static]

const { ... } utf32_lowercase_lut[] [static]

const struct utf32_nfkd utf32_nfkd_lut[] [static]

The upper 8 bits of c are reserved for flags.
The character value is, thus, only the lower 24 bits.

const { ... } utf32_normalization_specials[] [static]

This table is huge! It can be generated with the following AWK script:
awk 'BEGIN { FS=";" } /^[0-9A-Z]/ { sub("#.*", ""); printf("{ { "); for (i = 1; i < 6; i++) { gsub(" ", ", 0x", $i); sub("$", ",", $i); printf("{ 0x%s }%s", $i, i < 5 ? ", " : " } },\n" ); } } ' NormalizationTest.txt

const { ... } utf32_uppercase_lut[] [static]

const { ... } utf8_2nd_byte_tab[64] [static]

Generated on Sun Feb 12 10:50:10 2006 for Gtk-Gnutella by Doxygen 1.3.6


Data Structures
struct	utf32_nfkd
	The upper 8 bits of c are reserved for flags. More...
struct	utf32_comb_class
struct	utf32_general_category
Defines
#define	UTF32_NFKD_REPLACE_MAXLEN 18
#define	UTF32_F_MASK 0xff000000U
#define	UTF32_F_NFKD 0x80000000U /**< Set for compatibility compositions */
	Set for compatibility compositions.
#define	UTF32_NFD(x) (x)
#define	UTF32_NFKD(x) (UTF32_F_NFKD | (x))
#define	UNI_GC_LU UNI_GC_LETTER_UPPERCASE
#define	UNI_GC_LL UNI_GC_LETTER_LOWERCASE
#define	UNI_GC_LT UNI_GC_LETTER_TITLECASE
#define	UNI_GC_LM UNI_GC_LETTER_MODIFIER
#define	UNI_GC_LO UNI_GC_LETTER_OTHER
#define	UNI_GC_MN UNI_GC_MARK_NONSPACING
#define	UNI_GC_MC UNI_GC_MARK_SPACING_COMBINE
#define	UNI_GC_ME UNI_GC_MARK_ENCLOSING
#define	UNI_GC_ND UNI_GC_NUMBER_DECIMAL
#define	UNI_GC_NL UNI_GC_NUMBER_LETTER
#define	UNI_GC_NO UNI_GC_NUMBER_OTHER
#define	UNI_GC_PC UNI_GC_PUNCT_CONNECTOR
#define	UNI_GC_PD UNI_GC_PUNCT_DASH
#define	UNI_GC_PS UNI_GC_PUNCT_OPEN
#define	UNI_GC_PE UNI_GC_PUNCT_CLOSE
#define	UNI_GC_PI UNI_GC_PUNCT_INIT_QUOTE
#define	UNI_GC_PF UNI_GC_PUNCT_FINAL_QUOTE
#define	UNI_GC_PO UNI_GC_PUNCT_OTHER
#define	UNI_GC_SM UNI_GC_SYMBOL_MATH
#define	UNI_GC_SC UNI_GC_SYMBOL_CURRENCY
#define	UNI_GC_SK UNI_GC_SYMBOL_MODIFIER
#define	UNI_GC_SO UNI_GC_SYMBOL_OTHER
#define	UNI_GC_ZS UNI_GC_SEPARATOR_SPACE
#define	UNI_GC_ZL UNI_GC_SEPARATOR_LINE
#define	UNI_GC_ZP UNI_GC_SEPARATOR_PARAGRAPH
#define	UNI_GC_CC UNI_GC_OTHER_CONTROL
#define	UNI_GC_CF UNI_GC_OTHER_FORMAT
#define	UNI_GC_CS UNI_GC_OTHER_SURROGATE
#define	UNI_GC_CO UNI_GC_OTHER_PRIVATE_USE
#define	UNI_GC_CN UNI_GC_OTHER_NOT_ASSIGNED
Enumerations
enum	uni_gc_t { UNI_GC_LETTER_UPPERCASE = 0, UNI_GC_LETTER_LOWERCASE, UNI_GC_LETTER_TITLECASE, UNI_GC_LETTER_MODIFIER, UNI_GC_LETTER_OTHER, UNI_GC_MARK_NONSPACING, UNI_GC_MARK_SPACING_COMBINE, UNI_GC_MARK_ENCLOSING, UNI_GC_NUMBER_DECIMAL, UNI_GC_NUMBER_LETTER, UNI_GC_NUMBER_OTHER, UNI_GC_PUNCT_CONNECTOR, UNI_GC_PUNCT_DASH, UNI_GC_PUNCT_OPEN, UNI_GC_PUNCT_CLOSE, UNI_GC_PUNCT_INIT_QUOTE, UNI_GC_PUNCT_FINAL_QUOTE, UNI_GC_PUNCT_OTHER, UNI_GC_SYMBOL_MATH, UNI_GC_SYMBOL_CURRENCY, UNI_GC_SYMBOL_MODIFIER, UNI_GC_SYMBOL_OTHER, UNI_GC_SEPARATOR_SPACE, UNI_GC_SEPARATOR_LINE, UNI_GC_SEPARATOR_PARAGRAPH, UNI_GC_OTHER_CONTROL, UNI_GC_OTHER_FORMAT, UNI_GC_OTHER_SURROGATE, UNI_GC_OTHER_PRIVATE_USE, UNI_GC_OTHER_NOT_ASSIGNED }
Variables
struct {
const guchar start
const guchar end
}	utf8_2nd_byte_tab [64]
const struct utf32_nfkd	utf32_nfkd_lut []
	The upper 8 bits of c are reserved for flags.
struct {
guint16 lower
guint16 upper
}	utf32_uppercase_lut []
struct {
guint16 upper
guint16 lower
}	utf32_lowercase_lut []
const struct utf32_comb_class	utf32_comb_class_lut []
const guint32	utf32_composition_exclusions []
	These are special exclusions which cannot be derived from UnicodeData.txt but are listed in CompositionExclusions.txt.
const struct utf32_general_category	utf32_general_category_lut []
struct {
guint32 start
guint32 end
}	utf32_block_id_lut []
struct {
guint32 uc
guint16 len
}	utf32_normalization_specials []
	This table is huge! It can be generated with the following AWK script:
struct {
guint16 uc
const char s [4]
}	jap_tab []