Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

utf8.c File Reference


Detailed Description

Unicode Transformation Format 8 bits.

This code has been heavily inspired by utf8.c/utf8.h from Perl 5.6.1, written by Larry Wall et al.

Author:
Raphael Manfredi
Date:
2002-2003
Author:
Christian Biere
Date:
2004-2005

#include "common.h"
#include <locale.h>
#include <iconv.h>
#include "utf8_tables.h"
#include "utf8.h"
#include "misc.h"
#include "glib-missing.h"
#include "override.h"

Data Structures

struct  conv_to_utf8

Defines

#define D(name, id)   (iconv_t) -1, (name), (id), FALSE
#define UTF8_LENGTH_MARK(len)   utf8len_mark[len]
#define CHAR(x)   ((guchar) (x))
#define UTF8_BYTE_MARK   0x80
#define UTF8_BYTE_MASK   0xbf
#define UTF8_IS_ASCII(x)   (CHAR(x) < UTF8_BYTE_MARK)
#define UTF8_IS_START(x)   (CHAR(x) >= 0xc0 && CHAR(x) <= 0xfd)
#define UTF8_IS_CONTINUATION(x)   (CHAR(x) >= UTF8_BYTE_MARK && CHAR(x) <= UTF8_BYTE_MASK)
#define UTF8_IS_CONTINUED(x)   (CHAR(x) & UTF8_BYTE_MARK)
#define UTF8_CONT_MASK   (CHAR(0x3f))
#define UTF8_ACCU_SHIFT   6
#define UTF8_ACCUMULATE(o, n)   (((o) << UTF8_ACCU_SHIFT) | (CHAR(n) & UTF8_CONT_MASK))
#define UNI_SURROGATE_FIRST   0xd800
#define UNI_SURROGATE_SECOND   0xdc00
#define UNI_SURROGATE_LAST   0xdfff
#define UNI_HANGUL_FIRST   0xac00
#define UNI_HANGUL_LAST   0xd7a3
#define UNI_REPLACEMENT   0xfffd
#define UNI_BYTE_ORDER_MARK   0xfffe
#define UNI_ILLEGAL   0xffff
#define UNICODE_IS_SURROGATE(x)   ((x) >= UNI_SURROGATE_FIRST && (x) <= UNI_SURROGATE_LAST)
#define UNICODE_IS_HANGUL(x)   ((x) >= UNI_HANGUL_FIRST && (x) <= UNI_HANGUL_LAST)
#define UNICODE_IS_ASCII(x)   ((x) < 0x0080U)
#define UNICODE_IS_REPLACEMENT(x)   ((x) == UNI_REPLACEMENT)
#define UNICODE_IS_BYTE_ORDER_MARK(x)   ((0xFFFFU & (x)) == UNI_BYTE_ORDER_MARK)
#define UNICODE_IS_BOM(x)   UNICODE_IS_BYTE_ORDER_MARK(x)
#define UNICODE_IS_ILLEGAL(x)   ((x) > 0x10FFFFU || (UNI_ILLEGAL & (x)) == UNI_ILLEGAL)
#define GET_ITEM(i)   (utf32_comb_class_lut[(i)].uc)
#define FOUND(i)
#define GET_ITEM(i)   (i)
#define FOUND(i)
#define GET_ITEM(i)   (utf32_composition_exclusions[(i)])
#define FOUND(i)
#define GET_ITEM(i)   (i)
#define FOUND(i)
#define GET_ITEM(i)   (i)
#define FOUND(i)
#define UTF8_WARN_EMPTY   0
#define UTF8_WARN_CONTINUATION   1
#define UTF8_WARN_NON_CONTINUATION   2
#define UTF8_WARN_FE_FF   3
#define UTF8_WARN_SHORT   4
#define UTF8_WARN_OVERFLOW   5
#define UTF8_WARN_SURROGATE   6
#define UTF8_WARN_BOM   7
#define UTF8_WARN_LONG   8
#define UTF8_WARN_ILLEGAL   9
#define IS_NON_NUL_ASCII(p)   (!(*(p) & ~0x7f) && (*(p) > 0))
#define LAZY_CONVERT(func, proto, params)
 This macro is used to generate "lazy" variants of the converter functions.

#define GET_ITEM(i)   (utf32_nfkd_lut[(i)].c & ~UTF32_F_MASK)
#define FOUND(i)
#define GET_ITEM(i)   (utf32_uppercase_lut[(i)].lower)
#define FOUND(i)
#define GET_ITEM(i)   (utf32_lowercase_lut[(i)].upper)
#define FOUND(i)
#define T_COUNT   28
#define V_COUNT   21
#define N_COUNT   (T_COUNT * V_COUNT)
#define L_COUNT   19
#define T_COUNT   28
#define V_COUNT   21
#define N_COUNT   (T_COUNT * V_COUNT)
#define S_COUNT   (L_COUNT * N_COUNT)
#define GET_ITEM(i)   (jap_tab[(i)].uc)
#define FOUND(i)
#define REGRESSION(func)

Typedefs

typedef guint32(* utf32_remap_func )(guint32 uc)

Enumerations

enum  utf8_cd {
  UTF8_CD_ISO8859_1, UTF8_CD_ISO8859_6, UTF8_CD_ISO8859_7, UTF8_CD_ISO8859_8,
  UTF8_CD_SJIS, UTF8_CD_EUC_JP, UTF8_CD_KOI8_R, NUM_UTF8_CDS,
  UTF8_CD_INVALID = -1
}

Functions

 RCSID ("$Id:utf8.c, v 1.96 2006/02/02 23:58:20 cbiere Exp $")
void unicode_compose_init (void)
void regression_checks (void)
size_t utf8_decompose_nfd (const gchar *in, gchar *out, size_t size)
 Decomposes (NFD) an UTF-8 encoded string.

size_t utf8_decompose_nfkd (const gchar *in, gchar *out, size_t size)
 Decomposes (NFKD) an UTF-8 encoded string.

size_t utf32_strmaxlen (const guint32 *s, size_t maxlen)
 Determines the length of a UTF-32 string inspecting at most ``maxlen'' characters (not bytes!).

size_t utf32_to_utf8 (const guint32 *in, gchar *out, size_t size)
 Converts a UTF-32 encoded string to a UTF-8 encoded string.

size_t utf32_strlen (const guint32 *s)
 Determines the length of an UTF-32 string.

enum utf8_cd utf8_name_to_cd (const gchar *name)
 Looks up a "to UTF-8" converter by source charset name.

const gchar * utf8_cd_to_name (enum utf8_cd id)
 Determine the name of the source charset of a converter.

iconv_t utf8_cd_get (enum utf8_cd id)
 Get the iconv() conversion descriptor of a converter.

gboolean locale_is_utf8 (void)
const gchar * primary_filename_charset (void)
gboolean primary_filename_charset_is_utf8 (void)
G_GNUC_CONST guint utf8_skip (guchar c)
G_GNUC_CONST guint uniskip (guint32 uc)
 Determines the UTF-8 byte length for the given Unicode codepoint.

G_GNUC_CONST gboolean utf32_bad_codepoint (guint32 uc)
 Determines whether the given UTF-32 codepoint is valid in Unicode.

guint utf8_encoded_char_len (guint32 uc)
guint utf8_encode_char (guint32 uc, gchar *buf, size_t size)
 Needs short description here.

guint utf32_combining_class (guint32 uc)
gint block_id_cmp (size_t i, guint32 uc)
guint utf32_block_id (guint32 uc)
gboolean utf32_composition_exclude (guint32 uc)
gboolean utf32_is_non_character (guint32 uc)
 Checks whether the character is a non-character which is not the same as an unassigned character.

gint general_category_cmp (size_t i, guint32 uc)
uni_gc_t utf32_general_category (guint32 uc)
gint normalization_special_cmp (size_t i, guint32 uc)
gboolean utf32_is_normalization_special (guint32 uc)
guint32 utf8_decode_char (const gchar *s, gint len, guint *retlen, gboolean warn)
guint32 utf8_decode_char_fast (const gchar *s, guint *retlen)
 This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast.

guint utf8_char_len (const gchar *s)
 Are the first bytes of string `s' forming a valid UTF-8 character?

gboolean utf8_is_valid_string (const gchar *src)
gboolean utf8_is_valid_data (const gchar *src, size_t len)
size_t utf8_char_count (const gchar *src)
size_t utf8_data_char_count (const gchar *src, size_t len)
size_t utf8_strlcpy (gchar *dst, const gchar *src, size_t dst_size)
 Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.

gint utf16_encode_char (guint32 uc, guint16 *dst)
 Encodes a single UTF-32 character as UTF-16 into a buffer.

gint utf8_to_iso8859 (gchar *s, gint len, gboolean space)
 Convert UTF-8 string to ISO-8859-1 inplace.

const gchar * get_iconv_charset_alias (const gchar *cs)
const gchar * locale_get_charset (void)
 NOTE: The internal variable "charset" can be used to override the initially detected character set name.

const gchar * locale_get_language (void)
 Determine the current language.

struct conv_to_utf8 * conv_to_utf8_new (const gchar *cs)
GSList * get_filename_charsets (const gchar *locale)
 Emulate GLib 2.x behaviour and select the appropriate character set for filenames.

void textdomain_init (const char *codeset)
void locale_init_show_results (void)
void conversion_init (void)
void locale_init (void)
void locale_close (void)
 Called at shutdown time.

size_t complete_iconv (iconv_t cd, gchar *dst, size_t dst_left, const gchar *src, gboolean abort_on_error)
 Converts the string in "src" into the buffer "dst" using the iconv context "cd".

gchar * hyper_iconv (iconv_t cd, gchar *dst, size_t dst_size, const gchar *src, gboolean abort_on_error)
 Converts the string in "src" to "dst" using the iconv context "cd".

size_t utf8_enforce (gchar *dst, size_t size, const gchar *src)
 Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores.

size_t ascii_enforce (gchar *dst, size_t size, const gchar *src)
 Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores.

gchar * hyper_utf8_enforce (gchar *dst, size_t dst_size, const gchar *src)
gchar * hyper_ascii_enforce (gchar *dst, size_t dst_size, const gchar *src)
gchar * utf8_to_filename_charset (const gchar *src)
 Non-convertible characters will be replaced by '_'.

gchar * utf8_to_filename (const gchar *src)
 Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.

gchar * utf8_to_locale (const gchar *src)
 Non-convertible characters will be replaced by '_'.

gchar * convert_to_utf8 (iconv_t cd, const gchar *src)
gchar * locale_to_utf8 (const gchar *src)
 Converts a string from the locale's character set to UTF-8 encoding.

gchar * iso8859_1_to_utf8 (const gchar *src)
 Converts a string from ISO-8859-1 to UTF-8 encoding.

gboolean is_ascii_string (const gchar *s)
const gchar * ascii_rewind (const gchar *const s0, const gchar *p)
gboolean koi8_is_cyrillic_char (guchar c)
gboolean looks_like_koi8 (const gchar *src)
gboolean iso8859_is_valid_char (guchar c)
gboolean iso8859_6_is_arabic_char (guchar c)
gboolean iso8859_6_is_valid_char (guchar c)
gboolean looks_like_iso8859_6 (const gchar *src)
gboolean iso8859_7_is_greek_char (guchar c)
gboolean looks_like_iso8859_7 (const gchar *src)
gboolean iso8859_8_is_hebrew_char (guchar c)
gboolean iso8859_8_is_valid_char (guchar c)
gboolean looks_like_iso8859_8 (const gchar *src)
gboolean looks_like_sjis (const gchar *src)
 Matches SJIS encoded strings.

gboolean iso8859_is_valid_string (const gchar *src)
gchar * unknown_to_utf8 (const gchar *src, const gchar **charset_ptr)
 Converts the string to UTF-8 assuming an appropriate character set.

gchar * convert_to_utf8_normalized (iconv_t cd, const gchar *src, uni_norm_t norm)
gchar * locale_to_utf8_normalized (const gchar *src, uni_norm_t norm)
 Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * filename_to_utf8_normalized (const gchar *src, uni_norm_t norm)
 Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * iso8859_1_to_utf8_normalized (const gchar *src, uni_norm_t norm)
 Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr)
 Converts a string to UTF-8 encoding, assuming an appropriate character set, and to the specified Unicode normalization form.

gchar * utf8_to_ui_string (const gchar *src)
gchar * ui_string_to_utf8 (const gchar *src)
gchar * locale_to_ui_string (const gchar *src)
gchar * locale_to_ui_string2 (const gchar *src)
 LAZY_CONVERT (locale_to_utf8_normalized,(const gchar *src, uni_norm_t norm),(src, norm))
size_t utf32_to_utf8_inplace (guint32 *buf)
 Converts a UTF-32 encoded string to a UTF-8 encoded string.

guint32 * utf32_strdup (const guint32 *s)
 The equivalent of g_strdup() for UTF-32 strings.

gint64 utf32_strcmp (const guint32 *s1, const guint32 *s2)
const guint32 * utf32_decompose_lookup (guint32 uc, gboolean nfkd)
 Looks up the decomposed string for an UTF-32 character.

guint32 utf32_uppercase (guint32 uc)
 Looks up the simple uppercase variant of an UTF-32 character.

guint32 utf32_lowercase (guint32 uc)
 Looks up the simple lowercase variant of an UTF-32 character.

guint32 utf32_compose_char (guint32 a, guint32 b)
 Finds the composition of two UTF-32 characters.

guint32 * utf32_next_starter (const guint32 *s)
 Finds the next ``starter'' character (combining class zero) in the string starting at ``s''.

gboolean utf32_canonical_sorted (const guint32 *src)
 Checks whether an UTF-32 string is in canonical order.

gboolean utf32_is_decomposed_char (guint32 uc, gboolean nfkd)
gboolean utf32_is_decomposed (const guint32 *src, gboolean nfkd)
 Checks whether an UTF-32 string is decomposed.

guint32 * utf32_sort_canonical (guint32 *src)
 Puts an UTF-32 string into canonical order.

gboolean utf8_is_decomposed (const gchar *src, gboolean nfkd)
 Checks whether an UTF-8 encoded string is decomposed.

gboolean utf8_canonical_sorted (const gchar *src)
 Checks whether an UTF-8 encoded string is in canonical order.

gchar * utf8_sort_canonical (gchar *src)
 Puts an UTF-8 encoded string into canonical order.

guint utf32_decompose_hangul_char (guint32 uc, guint32 *buf)
 Decomposes a Hangul character.

size_t utf32_compose_hangul (guint32 *src)
 Composes all Hangul characters in a string.

const guint32 * utf32_decompose_single_char (guint32 uc, size_t *len, gboolean nfkd)
 Decomposes a single UTF-32 character.

const guint32 * utf32_decompose_char (guint32 uc, size_t *len, gboolean nfkd)
 Decomposes an UTF-32 character completely.

size_t utf8_decompose (const gchar *src, gchar *out, size_t size, gboolean nfkd)
 Decomposes an UTF-8 encoded string.

size_t utf32_decompose (const guint32 *in, guint32 *out, size_t size, gboolean nfkd)
 Decomposes an UTF-32 encoded string.

size_t utf32_decompose_nfd (const guint32 *in, guint32 *out, size_t size)
 Decomposes (NFD) an UTF-32 encoded string.

size_t utf32_decompose_nfkd (const guint32 *in, guint32 *out, size_t size)
 Decomposes (NFKD) an UTF-32 encoded string.

size_t utf8_remap (gchar *dst, const gchar *src, size_t size, utf32_remap_func remap)
 Copies the UTF-8 string ``src'' to ``dst'' remapping all characters using ``remap''.

size_t utf32_remap (guint32 *dst, const guint32 *src, size_t size, utf32_remap_func remap)
 Copies the UTF-32 string ``src'' to ``dst'' remapping all characters using ``remap''.

size_t utf32_strlower (guint32 *dst, const guint32 *src, size_t size)
 Copies ``src'' to ``dst'' converting all characters to lowercase.

size_t utf32_strupper (guint32 *dst, const guint32 *src, size_t size)
 Copies ``src'' to ``dst'' converting all characters to uppercase.

size_t utf8_strlower (gchar *dst, const gchar *src, size_t size)
 Copies ``src'' to ``dst'' converting all characters to lowercase.

size_t utf8_strupper (gchar *dst, const gchar *src, size_t size)
 Copies ``src'' to ``dst'' converting all characters to uppercase.

gchar * utf8_strlower_copy (const gchar *src)
 Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.

gchar * utf8_strupper_copy (const gchar *src)
 Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.

guint32 utf32_filter_char (guint32 uc, gboolean *space, gboolean last)
 Filters characters that are ignorable for query strings.

size_t utf32_filter (const guint32 *src, guint32 *dst, size_t size)
 Filters out all non-letter and non-digit characters by looking at their Unicode symbol type; all other characters are reduced to a normal space. At the same time, it tries to merge consecutive spaces while keeping the important non-spacing marks.

size_t utf32_split_blocks (const guint32 *src, guint32 *dst, size_t size)
 Copies the NUL-terminated UTF-32 string ``src'' to ``dst'' inserting an ASCII whitespace (U+0020) at every Unicode block change.

gboolean icu_enabled (void)
gboolean locale_is_latin (void)
size_t utf32_compose (guint32 *src)
 Composes an UTF-32 encoded string in-place.

guint32 * utf32_normalize (const guint32 *src, uni_norm_t norm)
gchar * utf8_normalize (const gchar *src, uni_norm_t norm)
 Normalizes a UTF-8 string to the requested normal form and returns it as a newly allocated string.

guint32 * utf32_canonize (const guint32 *src)
 Applies the NFKD/NFC algorithm to obtain normalized keywords.

gchar * utf8_canonize (const gchar *src)
 Applies the NFKD/NFC algorithm to obtain normalized keywords.

int compose_root_cmp (gconstpointer a, gconstpointer b)
 Helper function to sort the lists of ``utf32_compose_roots''.

void unicode_compose_add (guint idx)
 This is a helper for unicode_compose_init() to create the lookup table used by utf32_compose_char().

const gchar * utf8_dejap_char (const guint32 uc)
gboolean utf8_can_dejap (const gchar *src)
 Checks whether the given UTF-8 string contains any convertible characters.

size_t utf8_dejap (gchar *dst, size_t dst_size, const gchar *src)
 Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.

void regression_normalization_character_identity (void)
 Checks that the following holds except for the characters that appear in column 1 in Part 1 of NormalizationTest.txt.

void regression_normalization_issue (void)
 See: http://www.unicode.org/review/pr-29.html.

void regression_utf8_strlower (void)
void regression_bug_1211413 (void)
 The following code is supposed to reproduce bug #1211413.

void regression_iconv_utf8_to_utf8 (void)
 Some iconv()s let invalid UTF-8 with codepoints beyond U+10FFFF slip through, when converting from UTF-8 to UTF-8.

void regression_utf8_bijection (void)
void utf8_regression_checks (void)

Variables

guint32 common_dbg = 0
 XXX -- need to init lib's props --RAM.

gboolean unicode_compose_init_passed
gboolean locale_init_passed
gboolean use_icu = FALSE
 use_icu is set to TRUE if the initialization of ICU succeeded.

gboolean latin_locale = FALSE
 Used by locale_is_latin().

const gchar * charset = NULL
GSList * sl_filename_charsets = NULL
 A singly-linked list of conv_to_utf8 structs.

iconv_t cd_locale_to_utf8 = (iconv_t) -1
iconv_t cd_utf8_to_locale = (iconv_t) -1
 Mainly used for Gtk+ 1.2.

iconv_t cd_utf8_to_filename = (iconv_t) -1
 Mainly used for Gtk+ 1.2.

struct {
   iconv_t   cd
   const gchar *   name
   const enum utf8_cd   id
   gboolean   initialized
} utf8_cd_tab []
const guint8 utf8len_mark []
const char * codesets []
GHashTable * utf32_compose_roots


Define Documentation

#define CHAR  )     ((guchar) (x))
 

#define D(name, id)   (iconv_t) -1, (name), (id), FALSE
 

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return jap_tab[(i)].s; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return utf32_lowercase_lut[(i)].lower; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return utf32_uppercase_lut[(i)].upper; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return utf32_nfkd_lut[(i)].c & (nfkd ? 0 : UTF32_F_NFKD) \
        ? NULL \
        : utf32_nfkd_lut[(i)].d; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return TRUE; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return utf32_general_category_lut[(i)].gc; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return TRUE; \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return 1 + (i); \
    /* NOTREACHED */ \
} G_STMT_END

#define FOUND  ) 
 

Value:

G_STMT_START { \
    return utf32_comb_class_lut[(i)].cc; \
    /* NOTREACHED */ \
} G_STMT_END

#define GET_ITEM  )     (jap_tab[(i)].uc)
 

#define GET_ITEM  )     (utf32_lowercase_lut[(i)].upper)
 

#define GET_ITEM  )     (utf32_uppercase_lut[(i)].lower)
 

#define GET_ITEM  )     (utf32_nfkd_lut[(i)].c & ~UTF32_F_MASK)
 

#define GET_ITEM  )     (i)
 

#define GET_ITEM  )     (i)
 

#define GET_ITEM  )     (utf32_composition_exclusions[(i)])
 

#define GET_ITEM  )     (i)
 

#define GET_ITEM  )     (utf32_comb_class_lut[(i)].uc)
 

#define IS_NON_NUL_ASCII  )     (!(*(p) & ~0x7f) && (*(p) > 0))
 

#define L_COUNT   19
 

#define LAZY_CONVERT(func, proto, params)
 

Value:

const gchar * \
CAT2(lazy_,func) proto \
{ \
    static gchar *prev; /* Previous conversion result */ \
    gchar *dst; \
 \
    g_assert(src); \
    g_assert(prev != src); \
 \
    G_FREE_NULL(prev); \
 \
    dst = func params; \
    if (dst != src) \
        prev = dst; \
    return dst; \
}
This macro is used to generate "lazy" variants of the converter functions.

In this context "lazy" means that the function will either return the original string (if appropriate) or a newly allocated string but the newly allocated string MUST NOT be freed. Instead the memory will be released when the function is used again. Thus the handling is similar to that of functions which return static buffers except that the functions are not limited to a fixed buffer size. The return type has a const qualifier so that a blatant attempt to free the memory is usually caught at compile time. If the result is not the original string, it MUST NOT be passed as parameter to this function. The last allocated buffer will normally be leaked at exit time. However, if you pass an empty string, the last allocated buffer is released and the empty string itself is returned. This is not strictly necessary but it may be used to get rid of useless warnings about a "memory leak" or to keep the memory foot-print lower.

#define N_COUNT   (T_COUNT * V_COUNT)
 

#define N_COUNT   (T_COUNT * V_COUNT)
 

#define REGRESSION func   ) 
 

Value:

G_STMT_START { \
    printf("REGRESSION: regression_%s", STRINGIFY(func)); \
    fflush(stdout); \
    CAT2(regression_,func)(); \
    printf(" PASSED\n"); \
} G_STMT_END

#define S_COUNT   (L_COUNT * N_COUNT)
 

#define T_COUNT   28
 

#define T_COUNT   28
 

#define UNI_BYTE_ORDER_MARK   0xfffe
 

#define UNI_HANGUL_FIRST   0xac00
 

#define UNI_HANGUL_LAST   0xd7a3
 

#define UNI_ILLEGAL   0xffff
 

#define UNI_REPLACEMENT   0xfffd
 

#define UNI_SURROGATE_FIRST   0xd800
 

#define UNI_SURROGATE_LAST   0xdfff
 

#define UNI_SURROGATE_SECOND   0xdc00
 

#define UNICODE_IS_ASCII  )     ((x) < 0x0080U)
 

#define UNICODE_IS_BOM  )     UNICODE_IS_BYTE_ORDER_MARK(x)
 

#define UNICODE_IS_BYTE_ORDER_MARK  )     ((0xFFFFU & (x)) == UNI_BYTE_ORDER_MARK)
 

#define UNICODE_IS_HANGUL  )     ((x) >= UNI_HANGUL_FIRST && (x) <= UNI_HANGUL_LAST)
 

#define UNICODE_IS_ILLEGAL  )     ((x) > 0x10FFFFU || (UNI_ILLEGAL & (x)) == UNI_ILLEGAL)
 

#define UNICODE_IS_REPLACEMENT  )     ((x) == UNI_REPLACEMENT)
 

#define UNICODE_IS_SURROGATE  )     ((x) >= UNI_SURROGATE_FIRST && (x) <= UNI_SURROGATE_LAST)
 

#define UTF8_ACCU_SHIFT   6
 

#define UTF8_ACCUMULATE(o, n)   (((o) << UTF8_ACCU_SHIFT) | (CHAR(n) & UTF8_CONT_MASK))
 

#define UTF8_BYTE_MARK   0x80
 

#define UTF8_BYTE_MASK   0xbf
 

#define UTF8_CONT_MASK   (CHAR(0x3f))
 

#define UTF8_IS_ASCII  )     (CHAR(x) < UTF8_BYTE_MARK)
 

#define UTF8_IS_CONTINUATION  )     (CHAR(x) >= UTF8_BYTE_MARK && CHAR(x) <= UTF8_BYTE_MASK)
 

#define UTF8_IS_CONTINUED  )     (CHAR(x) & UTF8_BYTE_MARK)
 

#define UTF8_IS_START  )     (CHAR(x) >= 0xc0 && CHAR(x) <= 0xfd)
 

#define UTF8_LENGTH_MARK len   )     utf8len_mark[len]
 

#define UTF8_WARN_BOM   7
 

#define UTF8_WARN_CONTINUATION   1
 

#define UTF8_WARN_EMPTY   0
 

#define UTF8_WARN_FE_FF   3
 

#define UTF8_WARN_ILLEGAL   9
 

#define UTF8_WARN_LONG   8
 

#define UTF8_WARN_NON_CONTINUATION   2
 

#define UTF8_WARN_OVERFLOW   5
 

#define UTF8_WARN_SHORT   4
 

#define UTF8_WARN_SURROGATE   6
 

#define V_COUNT   21
 

#define V_COUNT   21
 


Typedef Documentation

typedef guint32(* utf32_remap_func)(guint32 uc)
 


Enumeration Type Documentation

enum utf8_cd
 

Enumeration values:
UTF8_CD_ISO8859_1 
UTF8_CD_ISO8859_6 
UTF8_CD_ISO8859_7 
UTF8_CD_ISO8859_8 
UTF8_CD_SJIS 
UTF8_CD_EUC_JP 
UTF8_CD_KOI8_R 
NUM_UTF8_CDS 
UTF8_CD_INVALID 


Function Documentation

size_t ascii_enforce gchar *  dst,
size_t  size,
const gchar *  src
 

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores.

``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated.

Parameters:
src a NUL-terminated string.
dst the destination buffer.
size the size in bytes of the destination buffer.
Returns:
the length in bytes of resulting string assuming size was sufficiently large.

TODO: Add overlap check

const gchar* ascii_rewind const gchar *const  s0,
const gchar *  p
[inline, static]
 

gint block_id_cmp size_t  i,
guint32  uc
[inline, static]
 

size_t complete_iconv iconv_t  cd,
gchar *  dst,
size_t  dst_left,
const gchar *  src,
gboolean  abort_on_error
[static]
 

Converts the string in "src" into the buffer "dst" using the iconv context "cd".

If "dst_left" is too small, the resulting string will be truncated. complete_iconv() returns the necessary buffer size. IFF "dst_left" is zero, "dst" may be NULL.

Note:
NOTE: This assumes 8-bit (char-based) encodings.
Parameters:
cd an iconv context; if it is (iconv_t) -1, NULL will be returned.
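dst the destination buffer; may be NULL IFF dst_left is zero.
dst_left the number of bytes available in the destination buffer (presumably; this parameter is not documented in the source).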
src the source string to convert.
abort_on_error If TRUE, the conversion is aborted and zero is returned on any error. Otherwise, if iconv() returns EINVAL or EILSEQ, an underscore is written to the destination buffer as replacement character.
Returns:
On success, the size of the converted string including the trailing NUL. Otherwise, zero is returned.

int compose_root_cmp gconstpointer  a,
gconstpointer  b
[static]
 

Helper function to sort the lists of ``utf32_compose_roots''.

struct conv_to_utf8* conv_to_utf8_new const gchar *  cs  ) 
 

void conversion_init void   )  [static]
 

gchar* convert_to_utf8 iconv_t  cd,
const gchar *  src
[static]
 

gchar* convert_to_utf8_normalized iconv_t  cd,
const gchar *  src,
uni_norm_t  norm
[static]
 

gchar* filename_to_utf8_normalized const gchar *  src,
uni_norm_t  norm
 

Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:
src the string to convert.
norm the Unicode normalization form to use.
Returns:
a newly allocated string.

gint general_category_cmp size_t  i,
guint32  uc
[inline, static]
 

GSList* get_filename_charsets const gchar *  locale  ) 
 

Emulate GLib 2.x behaviour and select the appropriate character set for filenames.

Parameters:
locale the name of the current locale character set.
Returns:
a list of newly allocated strings holding the names of the used character sets. The first is the one that should be used when creating files.

const gchar* get_iconv_charset_alias const gchar *  cs  ) 
 

Returns:
a string representing the current locale as an alias which is understood by GNU iconv. The returned pointer points to a static buffer.

gchar* hyper_ascii_enforce gchar *  dst,
size_t  dst_size,
const gchar *  src
 

gchar* hyper_iconv iconv_t  cd,
gchar *  dst,
size_t  dst_size,
const gchar *  src,
gboolean  abort_on_error
[static]
 

Converts the string in "src" to "dst" using the iconv context "cd".

If complete_iconv() fails, NULL is returned. Otherwise, the converted string is returned. If "dst" was sufficiently large, "dst" will be returned. If not, a newly allocated string is returned. In the latter case complete_iconv() has to run twice. IFF dst_size is zero, "dst" won't be touched and may be NULL. For best performance a small local buffer should be used as "dst" so that complete_iconv() does not have to run twice, especially if the result is only used temporarily and copying is not necessary.

Parameters:
cd an iconv context; if it is (iconv_t) -1, NULL will be returned.
src the source string to convert.
dst the destination buffer; may be NULL IFF dst_size is zero.
dst_size the size of the dst buffer.
abort_on_error If TRUE, NULL is returned if iconv() returns EINVAL or EILSEQ during the conversion. Otherwise, an underscore is used as replacement character and conversion continues.
Returns:
On success the converted string, either "dst" or a newly allocated string. Returns NULL on failure.

gchar* hyper_utf8_enforce gchar *  dst,
size_t  dst_size,
const gchar *  src
 

gboolean icu_enabled void   ) 
 

Returns:
TRUE if ICU was successfully initialized. If FALSE is returned none of the ICU-related functions must be used.

gboolean is_ascii_string const gchar *  s  ) 
 

gchar* iso8859_1_to_utf8 const gchar *  src  ) 
 

Converts a string from ISO-8859-1 to UTF-8 encoding.

The returned string is in no defined Unicode normalization form.

Parameters:
src a NUL-terminated string.
Returns:
a newly allocated UTF-8 encoded string.

gchar* iso8859_1_to_utf8_normalized const gchar *  src,
uni_norm_t  norm
 

Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:
src the string to convert.
norm the Unicode normalization form to use.
Returns:
a newly allocated string.

gboolean iso8859_6_is_arabic_char guchar  c  )  [inline, static]
 

gboolean iso8859_6_is_valid_char guchar  c  )  [inline, static]
 

gboolean iso8859_7_is_greek_char guchar  c  )  [inline, static]
 

gboolean iso8859_8_is_hebrew_char guchar  c  )  [inline, static]
 

gboolean iso8859_8_is_valid_char guchar  c  )  [inline, static]
 

gboolean iso8859_is_valid_char guchar  c  )  [inline, static]
 

gboolean iso8859_is_valid_string const gchar *  src  ) 
 

gboolean koi8_is_cyrillic_char guchar  c  )  [inline, static]
 

LAZY_CONVERT (locale_to_utf8_normalized, (const gchar *src, uni_norm_t norm), (src, norm))
 

void locale_close void   ) 
 

Called at shutdown time.

const gchar* locale_get_charset void   ) 
 

NOTE: The internal variable "charset" can be used to override the initially detected character set name.

Returns:
the name of current locale's character set.

const gchar* locale_get_language void   ) 
 

Determine the current language.

Returns:
A two-letter ISO 639 code of the language currently used for messages.

TRANSLATORS: Put the two-letter ISO 639 code here.

void locale_init void   ) 
 

void locale_init_show_results void   )  [static]
 

gboolean locale_is_latin void   ) 
 

gboolean locale_is_utf8 void   ) 
 

gchar* locale_to_ui_string const gchar *  src  )  [static]
 

gchar* locale_to_ui_string2 const gchar *  src  )  [static]
 

gchar* locale_to_utf8 const gchar *  src  ) 
 

Converts a string from the locale's character set to UTF-8 encoding.

The returned string is in no defined Unicode normalization form.

Parameters:
src a NUL-terminated string.
Returns:
a newly allocated UTF-8 encoded string.

gchar* locale_to_utf8_normalized const gchar *  src,
uni_norm_t  norm
 

Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:
src the string to convert.
norm the Unicode normalization form to use.
Returns:
a newly allocated string.

gboolean looks_like_iso8859_6 const gchar *  src  ) 
 

gboolean looks_like_iso8859_7 const gchar *  src  ) 
 

gboolean looks_like_iso8859_8 const gchar *  src  ) 
 

gboolean looks_like_koi8 const gchar *  src  ) 
 

gboolean looks_like_sjis const gchar *  src  ) 
 

Matches SJIS encoded strings.

Parameters:
src the string to examine.
SJIS encoding has code tables below:

  • ASCII/JIS Roman "[\x00-\x7F]"
  • JIS X 0208:1997 "[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]"
  • Half width Katakana "[\xA0-\xDF]"

gint normalization_special_cmp size_t  i,
guint32  uc
[inline, static]
 

const gchar* primary_filename_charset void   ) 
 

gboolean primary_filename_charset_is_utf8 void   )  [inline, static]
 

RCSID ("$Id:utf8.c, v 1.96 2006/02/02 23:58:20 cbiere Exp $")
 

void regression_bug_1211413 void   )  [static]
 

The following code is supposed to reproduce bug #1211413.

void regression_checks void   ) 
 

void regression_iconv_utf8_to_utf8 void   )  [static]
 

Some iconv()s let invalid UTF-8 with codepoints beyond U+10FFFF slip through, when converting from UTF-8 to UTF-8.

Thus, use utf8_enforce() for UTF-8 -> UTF-8 instead.

void regression_normalization_character_identity void   )  [static]
 

Checks that the following holds except for the characters that appear in column 1 in Part 1 of NormalizationTest.txt:

X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)

void regression_normalization_issue void   )  [static]
 

See: http://www.unicode.org/review/pr-29.html.

void regression_utf8_bijection void   )  [static]
 

void regression_utf8_strlower void   )  [static]
 

void textdomain_init const char *  codeset  )  [static]
 

gchar* ui_string_to_utf8 const gchar *  src  ) 
 

void unicode_compose_add guint  idx  )  [static]
 

This is a helper for unicode_compose_init() to create the lookup table used by utf32_compose_char().

The first character of the decomposition sequence is used as key, the index into the ``utf32_nfkd_lut'' is used as value.

void unicode_compose_init void   )  [static]
 

G_GNUC_CONST guint uniskip guint32  uc  )  [inline, static]
 

Determines the UTF-8 byte length for the given Unicode codepoint.

Parameters:
uc an UTF-32 codepoint.
Returns:
The exact amount of bytes necessary to store this codepoint in UTF-8 encoding.

gchar* unknown_to_utf8 const gchar *  src,
const gchar **  charset_ptr
 

Converts the string to UTF-8 assuming an appropriate character set.

The conversion result might still be rubbish but is guaranteed to be UTF-8 encoded.

The returned string is in no defined Unicode normalization form.

Parameters:
src a NUL-terminated string.
charset_ptr If not NULL, it will point to the name of the charset used to convert string.
Returns:
the original pointer or a newly allocated UTF-8 encoded string.

gchar* unknown_to_utf8_normalized const gchar *  src,
uni_norm_t  norm,
const gchar **  charset_ptr
 

Converts a string of unknown encoding to UTF-8, assuming an appropriate character set, and to the specified Unicode normalization form.

Parameters:
src the string to convert.
norm the Unicode normalization form to use.
charset_ptr no document.
Returns:
Either the original src pointer or a newly allocated string.

gint utf16_encode_char guint32  uc,
guint16 *  dst
 

Encodes a single UTF-32 character as UTF-16 into a buffer.

See also RFC 2781.

Parameters:
uc the unicode character to encode.
dst the destination buffer. MUST BE at least 4 bytes long.
Returns:
0 if the unicode character is invalid. Otherwise, the amount of UTF-16 characters is returned i.e., 1 or 2.

G_GNUC_CONST gboolean utf32_bad_codepoint guint32  uc  )  [inline, static]
 

Determines whether the given UTF-32 codepoint is valid in Unicode.

Parameters:
uc an UTF-32 codepoint.
Returns:
If the given codepoint is a surrogate, a BOM, out of range or an invalid codepoint FALSE is returned; otherwise TRUE.

guint utf32_block_id guint32  uc  )  [inline, static]
 

gboolean utf32_canonical_sorted const guint32 *  src  ) 
 

Checks whether an UTF-32 string is in canonical order.

guint32* utf32_canonize const guint32 *  src  ) 
 

Applies the NFKD/NFC algorithm to obtain normalized keywords.

guint utf32_combining_class guint32  uc  )  [inline, static]
 

size_t utf32_compose guint32 *  src  ) 
 

Composes an UTF-32 encoded string in-place.

The modified string might be shorter but is never longer than the original string.

NB: We assume that a direct composition eliminates at most one character. Further, the string must be in canonical order.

Parameters:
src an NUL-terminated UTF-32 string.
Returns:
the length in characters (not bytes!) of the possibly modified string.

guint32 utf32_compose_char guint32  a,
guint32  b
[static]
 

Finds the composition of two UTF-32 characters.

Parameters:
a an UTF-32 character (should be a starter)
b an UTF-32 character
Returns:
zero if there's no composition for the characters. Otherwise, the composed character is returned.

size_t utf32_compose_hangul guint32 *  src  )  [inline, static]
 

Composes all Hangul characters in a string.

gboolean utf32_composition_exclude guint32  uc  )  [inline, static]
 

size_t utf32_decompose const guint32 *  in,
guint32 *  out,
size_t  size,
gboolean  nfkd
[inline, static]
 

Decomposes an UTF-32 encoded string.

const guint32* utf32_decompose_char guint32  uc,
size_t *  len,
gboolean  nfkd
[inline, static]
 

Decomposes an UTF-32 character completely.

Parameters:
uc the UTF-32 to decompose.
len the variable ``len'' points to will be set to length in characters (not bytes!) of decomposed string. This is important because the decomposed string is not zero-terminated.
nfkd if TRUE, compatibility decomposition is used, otherwise canonical decomposition.
Returns:
a pointer to a buffer holding the decomposed string. The buffer is unterminated. The maximum length is UTF32_NFKD_REPLACE_MAXLEN characters. The returned pointer points to a static buffer which might get overwritten by subsequent calls to this function.

guint utf32_decompose_hangul_char guint32  uc,
guint32 *  buf
[inline, static]
 

Decomposes a Hangul character.

Parameters:
uc must be a Hangul character
buf must be at least three elements large
Returns:
the length of the decomposed character.

const guint32* utf32_decompose_lookup guint32  uc,
gboolean  nfkd
[static]
 

Looks up the decomposed string for an UTF-32 character.

Parameters:
uc the unicode character to look up.
nfkd if TRUE, compatibility decomposition is used, otherwise canonical decomposition.
Returns:
NULL if the character is not in decomposition table. Otherwise, the returned pointer points to a possibly unterminated UTF-32 string of maximum UTF32_NFD_REPLACE_MAXLEN characters. The result is constant.

size_t utf32_decompose_nfd const guint32 *  in,
guint32 *  out,
size_t  size
 

Decomposes (NFD) an UTF-32 encoded string.

size_t utf32_decompose_nfkd const guint32 *  in,
guint32 *  out,
size_t  size
 

Decomposes (NFKD) an UTF-32 encoded string.

const guint32* utf32_decompose_single_char guint32  uc,
size_t *  len,
gboolean  nfkd
[inline, static]
 

Decomposes a single UTF-32 character.

This must be used iteratively to gain the complete decomposition.

Parameters:
uc the UTF-32 to decompose.
len the variable ``len'' points to will be set to length in characters (not bytes!) of decomposed string. This is important because the decomposed string is not zero-terminated.
nfkd if TRUE, compatibility decomposition is used, otherwise canonical decomposition.
Returns:
a pointer to a buffer holding the decomposed string. The buffer is unterminated. The maximum length is UTF32_NFKD_REPLACE_MAXLEN characters. The returned pointer points to a static buffer which might get overwritten by subsequent calls to this function.

size_t utf32_filter const guint32 *  src,
guint32 *  dst,
size_t  size
 

Removes all non-letter and non-digit characters, judged by their Unicode symbol type. All other characters are reduced to a normal space, consecutive spaces are merged, and the important non-spacing marks are kept.

Parameters:
src an NUL-terminated UTF-32 string.
dst the output buffer to hold the modified UTF-32 string.
size the number of characters (not bytes!) dst can hold.
Returns:
The length of the output string.

guint32 utf32_filter_char guint32  uc,
gboolean *  space,
gboolean  last
[inline, static]
 

Filters characters that are ignorable for query strings.

*space should be initialized to TRUE for the first character of a string. ``space'' is used to prevent adding multiple space characters i.e., a space should not be followed by a space.

Parameters:
uc an UTF-32 character
space pointer to a gboolean holding the current space state
last should be TRUE if ``uc'' is the last character of the string.
Returns:
zero if the character should be skipped, otherwise the character itself or a replacement character.

uni_gc_t utf32_general_category guint32  uc  )  [inline, static]
 

gboolean utf32_is_decomposed const guint32 *  src,
gboolean  nfkd
 

Checks whether an UTF-32 string is decomposed.

gboolean utf32_is_decomposed_char guint32  uc,
gboolean  nfkd
[inline, static]
 

gboolean utf32_is_non_character guint32  uc  )  [inline, static]
 

Checks whether the character is a non-character which is not the same as an unassigned character.

Parameters:
uc an UTF-32 character
Returns:
TRUE if the character is a non-character, FALSE otherwise.

gboolean utf32_is_normalization_special guint32  uc  )  [inline, static]
 

guint32 utf32_lowercase guint32  uc  ) 
 

Looks up the simple lowercase variant of an UTF-32 character.

Returns:
the lowercase variant of ``uc'' or ``uc'' itself.

guint32* utf32_next_starter const guint32 *  s  )  [inline, static]
 

Finds the next ``starter'' character (combining class zero) in the string starting at ``s''.

Note that NUL is also a ``starter''.

Parameters:
s a NUL-terminated UTF-32 string.
Returns:
a pointer to the next ``starter'' character in ``s''.

guint32* utf32_normalize const guint32 *  src,
uni_norm_t  norm
 

size_t utf32_remap guint32 *  dst,
const guint32 *  src,
size_t  size,
utf32_remap_func  remap
[static]
 

Copies the UTF-32 string ``src'' to ``dst'' remapping all characters using ``remap''.

If the created string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-32 encoded, otherwise the string will be truncated.

Parameters:
dst the target buffer
src an UTF-32 string
size the size of dst in bytes
remap a function that takes a single UTF-32 character and returns a single UTF-32 character.
Returns:
the length in bytes of the converted string ``src''.

guint32* utf32_sort_canonical guint32 *  src  ) 
 

Puts an UTF-32 string into canonical order.

size_t utf32_split_blocks const guint32 *  src,
guint32 *  dst,
size_t  size
 

Copies the NUL-terminated UTF-32 string ``src'' to ``dst'' inserting an ASCII whitespace (U+0020) at every Unicode block change.

If the block change is caused by such an ASCII whitespace itself, no additional space is inserted.

Parameters:
src an NUL-terminated UTF-32 string.
dst the output buffer to hold the modified UTF-32 string.
size the number of characters (not bytes!) dst can hold.
Returns:
The length of the output string.

gint64 utf32_strcmp const guint32 *  s1,
const guint32 *  s2
 

guint32* utf32_strdup const guint32 *  s  ) 
 

The equivalent of g_strdup() for UTF-32 strings.

size_t utf32_strlen const guint32 *  s  ) 
 

Determines the length of an UTF-32 string.

Parameters:
s a NUL-terminated UTF-32 string.
Returns:
the length in characters (not bytes!) of the string ``s''.

size_t utf32_strlower guint32 *  dst,
const guint32 *  src,
size_t  size
 

Copies ``src'' to ``dst'' converting all characters to lowercase.

If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter.

Parameters:
dst the target buffer
src an UTF-32 string
size the size of dst in bytes
Returns:
the length in characters of the converted string ``src''.

size_t utf32_strmaxlen const guint32 *  s,
size_t  maxlen
 

Determines the length of a UTF-32 string inspecting at most ``maxlen'' characters (not bytes!).

This can safely be used with unterminated UTF-32 strings if ``maxlen'' has an appropriate value.

To detect whether the actual string is longer than ``maxlen'' characters, just check if ``string[maxlen]'' is 0x0000, if and only if the returned value equals maxlen. Otherwise, the returned value is indeed the complete length of the UTF-32 string.

Parameters:
s an UTF-32 string.
maxlen the maximum number of characters to inspect.
Returns:
the length in characters (not bytes!) of the string ``s''.

size_t utf32_strupper guint32 *  dst,
const guint32 *  src,
size_t  size
 

Copies ``src'' to ``dst'' converting all characters to uppercase.

If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter.

Parameters:
dst the target buffer
src an UTF-32 string
size the size of dst in bytes
Returns:
the length in characters of the converted string ``src''.

size_t utf32_to_utf8 const guint32 *  src,
gchar *  dst,
size_t  size
 

Converts a UTF-32 encoded string to a UTF-8 encoded string.

The target string ``dst'' is always zero-terminated unless ``size'' is zero.

Parameters:
src the UTF-32 input string.
dst the target buffer for converted UTF-8 string.
size the length of the outbuf buffer in bytes. Whether the buffer was too small can be checked by comparing ``size'' with the return value. The value of ``size'' MUST NOT exceed INT_MAX.
Returns:
the length in bytes of completely converted string.

size_t utf32_to_utf8_inplace guint32 *  buf  ) 
 

Converts a UTF-32 encoded string to a UTF-8 encoded string.

The target string ``out'' is always zero-terminated unless ``size'' is zero.

Parameters:
buf the UTF-32 input string.
Returns:
the length in bytes of completely converted string.

guint32 utf32_uppercase guint32  uc  )  [static]
 

Looks up the simple uppercase variant of an UTF-32 character.

Returns:
the uppercase variant of ``uc'' or ``uc'' itself.

gboolean utf8_can_dejap const gchar *  src  ) 
 

Checks whether the given UTF-8 string contains any convertible characters.

Parameters:
src an UTF-8 encoded NUL-terminated string.
Returns:
TRUE if utf8_dejap() would convert any characters; otherwise FALSE.

gboolean utf8_canonical_sorted const gchar *  src  ) 
 

Checks whether an UTF-8 encoded string is in canonical order.

gchar* utf8_canonize const gchar *  src  ) 
 

Applies the NFKD/NFC algorithm to obtain normalized keywords.

iconv_t utf8_cd_get enum utf8_cd  id  )  [static]
 

Get the iconv() conversion descriptor of a converter.

const gchar* utf8_cd_to_name enum utf8_cd  id  ) 
 

Determine the name of the source charset of a converter.

size_t utf8_char_count const gchar *  src  ) 
 

guint utf8_char_len const gchar *  s  ) 
 

Are the first bytes of string `s' forming a valid UTF-8 character?

Parameters:
s a NUL-terminated string or at minimum a buffer with 4 bytes.
Returns:
amount of bytes used to encode that character, or 0 if invalid.

size_t utf8_data_char_count const gchar *  src,
size_t  len
 

guint32 utf8_decode_char const gchar *  s,
gint  len,
guint *  retlen,
gboolean  warn
 

Returns:
the character value of the first character in the string `s', which is assumed to be in UTF-8 encoding and no longer than `len'. `retlen' will be set to the length, in bytes, of that character.
If `s' does not point to a well-formed UTF-8 character, the behaviour is dependent on the value of `warn'. When FALSE, it is assumed that the caller will raise a warning, and this function will silently just set `retlen' to 0 and return zero.

guint32 utf8_decode_char_fast const gchar *  s,
guint *  retlen
 

This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast.

Use it when you don't need warnings and you don't know the length of the string you're reading from.

Returns:
the character value of the first character in the string `s', which is assumed to be in UTF-8 encoding, and ending with a NUL byte. `retlen' will be set to the length, in bytes, of that character.
If `s' does not point to a well-formed UTF-8 character, `retlen' is set to 0 and the function returns 0.

size_t utf8_decompose const gchar *  src,
gchar *  out,
size_t  size,
gboolean  nfkd
[inline, static]
 

Decomposes an UTF-8 encoded string.

The UTF-8 string written to ``dst'' is always NUL-terminated unless ``size'' is zero. If the size of ``dst'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.

Parameters:
src a UTF-8 encoded string.
out a pointer to a buffer which will hold the decomposed string.
size the number of bytes ``dst'' can hold.
nfkd if TRUE, compatibility decomposition is used, otherwise canonical decomposition.
Returns:
the length in bytes (not characters!) of completely decomposed string.

size_t utf8_decompose_nfd const gchar *  src,
gchar *  out,
size_t  size
 

Decomposes (NFD) an UTF-8 encoded string.

The UTF-8 string written to ``dst'' is always NUL-terminated unless ``size'' is zero. If the size of ``dst'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.

Parameters:
src a UTF-8 encoded string.
out a pointer to a buffer which will hold the decomposed string.
size the number of bytes ``dst'' can hold.
Returns:
the length in bytes (not characters!) of completely decomposed string.

size_t utf8_decompose_nfkd const gchar *  src,
gchar *  out,
size_t  size
 

Decomposes (NFKD) an UTF-8 encoded string.

The UTF-8 string written to ``dst'' is always NUL-terminated unless ``size'' is zero. If the size of ``dst'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.

Parameters:
src a UTF-8 encoded string.
out a pointer to a buffer which will hold the decomposed string.
size the number of bytes ``dst'' can hold.
Returns:
the length in bytes (not characters!) of completely decomposed string.

size_t utf8_dejap gchar *  dst,
size_t  dst_size,
const gchar *  src
 

Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.

Parameters:
dst the destination buffer.
dst_size the size of the dst buffer in bytes.
src the source string.
Returns:
The length in bytes of the resulting string assuming dst_size was sufficient.

const gchar* utf8_dejap_char const guint32  uc  ) 
 

guint utf8_encode_char guint32  uc,
gchar *  buf,
size_t  size
[static]
 

Encodes a single UTF-32 character as UTF-8 into a buffer.

Parameters:
uc the unicode character to encode.
buf the destination buffer. MUST BE at least 4 bytes long.
size no document.
Returns:
0 if the unicode character is invalid. Otherwise the length of the UTF-8 character is returned.

guint utf8_encoded_char_len guint32  uc  )  [inline, static]
 

Parameters:
uc the unicode character to encode.
Returns:
0 if the unicode codepoint is invalid. Otherwise the length of the UTF-8 character is returned.

size_t utf8_enforce gchar *  dst,
size_t  size,
const gchar *  src
 

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores.

``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated but the UTF-8 encoding is preserved in any case.

Parameters:
src a NUL-terminated string.
dst the destination buffer.
size the size in bytes of the destination buffer.
Returns:
the length in bytes of resulting string assuming size was sufficiently large.

TODO: Add overlap check

gboolean utf8_is_decomposed const gchar *  src,
gboolean  nfkd
 

Checks whether an UTF-8 encoded string is decomposed.

gboolean utf8_is_valid_data const gchar *  src,
size_t  len
 

Returns:
amount of UTF-8 chars when first `len' bytes of the given string `s' form a valid UTF-8 string, 0 meaning the string is not valid UTF-8.
Note:
If `len' is 0, the string must be NUL-terminated.

gboolean utf8_is_valid_string const gchar *  src  ) 
 

Returns:
amount of UTF-8 chars when first `len' bytes of the given string `s' form a valid UTF-8 string, 0 meaning the string is not valid UTF-8.
Parameters:
src a NUL-terminated string.

enum utf8_cd utf8_name_to_cd const gchar *  name  )  [static]
 

Looks up a "to UTF-8" converter by source charset name.

gchar* utf8_normalize const gchar *  src,
uni_norm_t  norm
 

Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string.

Parameters:
src the string to normalize, must be valid UTF-8.
norm one of UNI_NORM_NFC, UNI_NORM_NFD, UNI_NORM_NFKC, UNI_NORM_NFKD.
Returns:
a newly allocated string

void utf8_regression_checks void   ) 
 

size_t utf8_remap gchar *  dst,
const gchar *  src,
size_t  size,
utf32_remap_func  remap
[static]
 

Copies the UTF-8 string ``src'' to ``dst'' remapping all characters using ``remap''.

If the created string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.

Parameters:
dst the target buffer
src an UTF-8 string
size the size of dst in bytes
remap a function that takes a single UTF-32 character and returns a single UTF-32 character.
Returns:
the length in bytes of the converted string ``src''.

G_GNUC_CONST guint utf8_skip guchar  c  )  [inline, static]
 

gchar* utf8_sort_canonical gchar *  src  ) 
 

Puts an UTF-8 encoded string into canonical order.

size_t utf8_strlcpy gchar *  dst,
const gchar *  src,
size_t  dst_size
 

Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.

Parameters:
dst the target buffer to copy the string to.
src the source buffer to copy the string from.
dst_size the number of bytes ``dst'' can hold.

size_t utf8_strlower gchar *  dst,
const gchar *  src,
size_t  size
 

Copies ``src'' to ``dst'' converting all characters to lowercase.

If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.

Parameters:
dst the target buffer
src an UTF-8 string
size the size of dst in bytes
Returns:
the length in bytes of the converted string ``src''.

gchar* utf8_strlower_copy const gchar *  src  ) 
 

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.

Parameters:
src an UTF-8 string
Returns:
a newly allocated buffer containing the lowercased string.

size_t utf8_strupper gchar *  dst,
const gchar *  src,
size_t  size
 

Copies ``src'' to ``dst'' converting all characters to uppercase.

If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.

Parameters:
dst the target buffer
src an UTF-8 string
size the size of dst in bytes
Returns:
the length in bytes of the converted string ``src''.

gchar* utf8_strupper_copy const gchar *  src  ) 
 

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.

Parameters:
src an UTF-8 string
Returns:
a newly allocated buffer containing the uppercased string.

gchar* utf8_to_filename const gchar *  src  ) 
 

Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.

Parameters:
src a NUL-terminated UTF-8 encoded string.
Returns:
a pointer to a newly allocated buffer holding the converted string.

gchar* utf8_to_filename_charset const gchar *  src  )  [static]
 

Non-convertible characters will be replaced by '_'.

The returned string WILL be NUL-terminated in any case.

In case of an unrecoverable error, NULL is returned.

Parameters:
src a NUL-terminated string.
Returns:
a pointer to a newly allocated buffer holding the converted string.

gint utf8_to_iso8859 gchar *  s,
gint  len,
gboolean  space
 

Convert UTF-8 string to ISO-8859-1 inplace.

If `space' is TRUE, all characters outside the U+0000 .. U+00FF range are turned to space U+0020. Otherwise, we stop at the first out-of-range character.

If `len' is 0, the length of the string is computed with strlen().

Returns:
length of decoded string.

gchar* utf8_to_locale const gchar *  src  ) 
 

Non-convertible characters will be replaced by '_'.

The returned string WILL be NUL-terminated in any case.

In case of an unrecoverable error, NULL is returned.

Parameters:
src a NUL-terminated string.
Returns:
a pointer to a newly allocated buffer holding the converted string.

gchar* utf8_to_ui_string const gchar *  src  ) 
 


Variable Documentation

iconv_t cd
 

iconv() conversion descriptor; may be -1

iconv_t cd_locale_to_utf8 = (iconv_t) -1 [static]
 

iconv_t cd_utf8_to_filename = (iconv_t) -1 [static]
 

Mainly used for Gtk+ 1.2.

iconv_t cd_utf8_to_locale = (iconv_t) -1 [static]
 

Mainly used for Gtk+ 1.2.

const gchar* charset = NULL [static]
 

const char* codesets[] [static]
 

guint32 common_dbg = 0 [static]
 

XXX -- need to init lib's props --RAM.

It affects only those functions that are explicitly defined to handle UI strings as input or output. This allows reducing the number of conversions. For example, if a function specification permits that the original string may be returned, we will do that instead of creating a copy. If UI_USES_UTF8_ENCODING is undefined, it is assumed that the user-interface uses the locale's encoding for its strings.

const enum utf8_cd id
 

Enumerated ID of the converter.

gboolean initialized
 

Whether initialization of "cd" was attempted.

gboolean latin_locale = FALSE [static]
 

Used by is_latin_locale().

It is initialized by locale_init().

gboolean locale_init_passed [static]
 

const gchar* name
 

Name of the source charset.

GSList* sl_filename_charsets = NULL [static]
 

A single-linked list of conv_to_utf8 structs.

The first one is used for converting from the primary charset. Additional charsets are optional.

gboolean unicode_compose_init_passed [static]
 

gboolean use_icu = FALSE [static]
 

use_icu is set to TRUE if the initialization of ICU succeeded.

If it fails, we'll fall back to the non-ICU behaviour.

GHashTable* utf32_compose_roots [static]
 

struct { ... } utf8_cd_tab[] [static]
 

const guint8 utf8len_mark[] [static]
 

Initial value:

 {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
}


Generated on Sun Feb 12 10:50:09 2006 for Gtk-Gnutella by doxygen 1.3.6