utf8.h File Reference

Detailed Description

Unicode Transformation Format 8 bits.

Author:: Raphael Manfredi

Date:: 2002-2003

#include <glib.h>

Go to the source code of this file.

Defines

#define UNI_NORM_GUI   UNI_NORM_NFC

#define UNI_NORM_NETWORK   UNI_NORM_NFC

#define UNI_NORM_FILESYSTEM   UNI_NORM_NFC

#define UNICODE_CANONIZE(x)   utf8_canonize(x)

Enumerations

enum uni_norm_t {
  UNI_NORM_NFC = 0, UNI_NORM_NFKC, UNI_NORM_NFD, UNI_NORM_NFKD,
  NUM_UNI_NORM
}

Functions

void locale_init (void)

void locale_close (void)

Called at shutdown time.

const gchar * locale_get_charset (void)

NOTE: The internal variable "charset" can be used to override the initially detected character set name.

const gchar * locale_get_language (void)

Determine the current language.

guint utf8_char_len (const gchar *s)

Are the first bytes of string `s' forming a valid UTF-8 character?

gboolean is_ascii_string (const gchar *str)

gboolean utf8_is_valid_string (const gchar *s)

gboolean utf8_is_valid_data (const gchar *s, size_t n)

size_t utf8_char_count (const gchar *s)

size_t utf8_strlcpy (gchar *dst, const gchar *src, size_t dst_size)

Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.

guint32 utf8_decode_char_fast (const gchar *s, guint *retlen)

This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast.

gint utf8_to_iso8859 (gchar *s, gint len, gboolean space)

Convert UTF-8 string to ISO-8859-1 inplace.

size_t utf8_strlower (gchar *dst, const gchar *src, size_t size)

Copies ``src'' to ``dst'' converting all characters to lowercase.

gchar * utf8_strlower_copy (const gchar *src)

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.

size_t utf8_strupper (gchar *dst, const gchar *src, size_t size)

Copies ``src'' to ``dst'' converting all characters to uppercase.

gchar * utf8_strupper_copy (const gchar *src)

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.

gchar * utf8_canonize (const gchar *src)

Apply the NFKD/NFC algorithm to produce normalized keywords.

gchar * utf8_normalize (const gchar *src, uni_norm_t norm)

Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string.

size_t utf32_to_utf8 (const guint32 *in, gchar *out, size_t size)

guint32 utf32_lowercase (guint32 uc)

Looks up the simple lowercase variant of an UTF-32 character.

size_t utf8_decode_lookahead (const gchar *s, size_t len)

This is a highly specialized function (read: don't use it if you don't understand what it does and how it's used) to be used with utf8_decode_char().

guint32 utf16_encode_char_compact (guint32 uc)

Encodes a single UTF-32 character as UTF-16 and returns the result compacted into a 32-bit integer.

const gchar * lazy_iso8859_1_to_utf8 (const gchar *src)

Lazy converters either return a pointer to a static buffer or manage the allocated memory themselves.

const gchar * lazy_ui_string_to_utf8 (const gchar *src)

const gchar * lazy_utf8_to_ui_string (const gchar *src)

const gchar * lazy_utf8_to_locale (const gchar *src)

const gchar * lazy_locale_to_utf8 (const gchar *src)

const gchar * lazy_locale_to_ui_string (const gchar *src)

const gchar * lazy_locale_to_ui_string2 (const gchar *src)

const gchar * lazy_locale_to_utf8_normalized (const gchar *src, uni_norm_t norm)

const gchar * lazy_unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr)

gchar * iso8859_1_to_utf8 (const gchar *str)

Converts a string from ISO-8859-1 to UTF-8 encoding.

gchar * iso8859_1_to_utf8_normalized (const gchar *str, uni_norm_t norm)

Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * utf8_to_ui_string (const gchar *src)

gchar * ui_string_to_utf8 (const gchar *src)

gchar * utf8_to_locale (const gchar *s)

Non-convertible characters will be replaced by '_'.

gchar * locale_to_utf8 (const gchar *str)

Converts a string from the locale's character set to UTF-8 encoding.

gchar * locale_to_utf8_normalized (const gchar *str, uni_norm_t norm)

Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * utf8_to_filename (const gchar *s)

Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.

gchar * filename_to_utf8_normalized (const gchar *str, uni_norm_t norm)

Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.

gchar * unknown_to_utf8 (const gchar *str, const gchar **charset_ptr)

Converts the string to UTF-8 assuming an appropriate character set.

gchar * unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr)

Converts a string from an unknown character set to UTF-8 encoding and the specified Unicode normalization form.

size_t ascii_enforce (gchar *dst, size_t size, const gchar *src)

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores.

size_t utf8_enforce (gchar *dst, size_t size, const gchar *src)

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores.

gboolean icu_enabled (void)

gboolean locale_is_latin (void)

gboolean locale_is_utf8 (void)

gboolean utf8_can_dejap (const gchar *src)

Checks whether the given UTF-8 string contains any convertible characters.

size_t utf8_dejap (gchar *dst, size_t dst_size, const gchar *src)

Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.

Define Documentation

#define UNI_NORM_FILESYSTEM UNI_NORM_NFC

#define UNI_NORM_GUI UNI_NORM_NFC

#define UNI_NORM_NETWORK UNI_NORM_NFC

#define UNICODE_CANONIZE ( x ) utf8_canonize(x)

Enumeration Type Documentation

enum uni_norm_t

Enumeration values:

UNI_NORM_NFC

UNI_NORM_NFKC

UNI_NORM_NFD

UNI_NORM_NFKD

NUM_UNI_NORM

Function Documentation

size_t ascii_enforce ( gchar * dst,

size_t size,

const gchar * src

)

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores.
``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated.

Parameters:

src a NUL-terminated string.

dst the destination buffer.

size the size in bytes of the destination buffer.

Returns:
the length in bytes of resulting string assuming size was sufficiently large.

TODO: Add overlap check

gchar* filename_to_utf8_normalized ( const gchar * src,

uni_norm_t norm

)

Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:

src the string to convert.

norm the Unicode normalization form to use.

Returns:
a newly allocated string.

gboolean icu_enabled ( void )

Returns:
TRUE if ICU was successfully initialized. If FALSE is returned none of the ICU-related functions must be used.

gboolean is_ascii_string ( const gchar * str )

gchar* iso8859_1_to_utf8 ( const gchar * src )

Converts a string from ISO-8859-1 to UTF-8 encoding.
The returned string is in no defined Unicode normalization form.

Parameters:

src a NUL-terminated string.

Returns:
a newly allocated UTF-8 encoded string.

gchar* iso8859_1_to_utf8_normalized ( const gchar * src,

uni_norm_t norm

)

Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:

src the string to convert.

norm the Unicode normalization form to use.

Returns:
a newly allocated string.

const gchar* lazy_iso8859_1_to_utf8 ( const gchar * src )

Lazy converters either return a pointer to a static buffer or manage the allocated memory themselves.
They may also return the original pointer. Copy the result before calling them again unless you don't need the previous result anymore.

const gchar* lazy_locale_to_ui_string ( const gchar * src )

const gchar* lazy_locale_to_ui_string2 ( const gchar * src )

const gchar* lazy_locale_to_utf8 ( const gchar * src )

const gchar* lazy_locale_to_utf8_normalized ( const gchar * src,

uni_norm_t norm

)

const gchar* lazy_ui_string_to_utf8 ( const gchar * src )

const gchar* lazy_unknown_to_utf8_normalized ( const gchar * src,

uni_norm_t norm,

const gchar ** charset_ptr

)

const gchar* lazy_utf8_to_locale ( const gchar * src )

const gchar* lazy_utf8_to_ui_string ( const gchar * src )

void locale_close ( void )

Called at shutdown time.

const gchar* locale_get_charset ( void )

NOTE: The internal variable "charset" can be used to override the initially detected character set name.

Returns:
the name of current locale's character set.

const gchar* locale_get_language ( void )

Determine the current language.

Returns:
A two-letter ISO 639 code of the language currently used for messages.

TRANSLATORS: Put the two-letter ISO 639 code here.

void locale_init ( void )

gboolean locale_is_latin ( void )

gboolean locale_is_utf8 ( void )

gchar* locale_to_utf8 ( const gchar * src )

Converts a string from the locale's character set to UTF-8 encoding.
The returned string is in no defined Unicode normalization form.

Parameters:

src a NUL-terminated string.

Returns:
a newly allocated UTF-8 encoded string.

gchar* locale_to_utf8_normalized ( const gchar * src,

uni_norm_t norm

)

Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:

src the string to convert.

norm the Unicode normalization form to use.

Returns:
a newly allocated string.

gchar* ui_string_to_utf8 ( const gchar * src )

gchar* unknown_to_utf8 ( const gchar * src,

const gchar ** charset_ptr

)

Converts the string to UTF-8 assuming an appropriate character set.
The conversion result might still be rubbish but is guaranteed to be UTF-8 encoded.
The returned string is in no defined Unicode normalization form.

Parameters:

src a NUL-terminated string.

charset_ptr If not NULL, it will point to the name of the charset used to convert string.

Returns:
the original pointer or a newly allocated UTF-8 encoded string.

gchar* unknown_to_utf8_normalized ( const gchar * src,

uni_norm_t norm,

const gchar ** charset_ptr

)

Converts a string from an unknown character set to UTF-8 encoding and the specified Unicode normalization form.

Parameters:

src the string to convert.

norm the Unicode normalization form to use.

charset_ptr not documented.

Returns:
Either the original src pointer or a newly allocated string.

guint32 utf16_encode_char_compact ( guint32 uc ) [inline, static]

Encodes a single UTF-32 character as UTF-16 and returns the result compacted into a 32-bit integer.
See also RFC 2781.

Parameters:

uc the unicode character to encode.

Returns:
(guint32) -1 if the unicode character is invalid. Otherwise the UTF-16 encoded character is returned in a compact form: The lower 16 bits are the first UTF-16 character, the upper 16 bits are the second one. If the upper bits are all zero, the unicode character fits into 16 bits.

guint32 utf32_lowercase ( guint32 uc )

Looks up the simple lowercase variant of an UTF-32 character.

Returns:
the lowercase variant of ``uc'' or ``uc'' itself.

size_t utf32_to_utf8 ( const guint32 * in,

gchar * out,

size_t size

)

gboolean utf8_can_dejap ( const gchar * src )

Checks whether the given UTF-8 string contains any convertible characters.

Parameters:

src an UTF-8 encoded NUL-terminated string.

Returns:
TRUE if utf8_dejap() would convert any characters; otherwise FALSE.

gchar* utf8_canonize ( const gchar * src )

Apply the NFKD/NFC algorithm to produce normalized keywords.

size_t utf8_char_count ( const gchar * s )

guint utf8_char_len ( const gchar * s )

Are the first bytes of string `s' forming a valid UTF-8 character?

Parameters:

s a NUL-terminated string or at minimum a buffer with 4 bytes.

Returns:
amount of bytes used to encode that character, or 0 if invalid.

guint32 utf8_decode_char_fast ( const gchar * s,

guint * retlen

)

This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast.
Use it when you don't need warnings and you don't know the length of the string you're reading from.

Returns:
the character value of the first character in the string `s', which is assumed to be in UTF-8 encoding, and ending with a NUL byte. `retlen' will be set to the length, in bytes, of that character.
If `s' does not point to a well-formed UTF-8 character, `retlen' is set to 0 and the function returns 0.

size_t utf8_decode_lookahead ( const gchar * s,

size_t len

) [inline, static]

This is a highly specialized function (read: don't use it if you don't understand what it does and how it's used) to be used with utf8_decode_char().
Its purpose is to determine the maximum possible length in bytes of the current UTF-8 character that ``s'' points to.

Parameters:

s a UTF-8 encoded string.

len number of bytes pending to be decoded.

Returns:
the maximum length in bytes of the current UTF-8 character.

size_t utf8_dejap ( gchar * dst,

size_t dst_size,

const gchar * src

)

Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.

Parameters:

dst the destination buffer.

dst_size the size of the dst buffer in bytes.

src the source string.

Returns:
The length in bytes of the resulting string assuming dst_size was sufficient.

size_t utf8_enforce ( gchar * dst,

size_t size,

const gchar * src

)

Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores.
``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated but the UTF-8 encoding is preserved in any case.

Parameters:

src a NUL-terminated string.

dst the destination buffer.

size the size in bytes of the destination buffer.

Returns:
the length in bytes of resulting string assuming size was sufficiently large.

TODO: Add overlap check

gboolean utf8_is_valid_data ( const gchar * src,

size_t len

)

Returns:
amount of UTF-8 chars when the first `len' bytes of the given string `s' form a valid UTF-8 string, 0 meaning the string is not valid UTF-8.

Note:
If `len' is 0, the string must be NUL-terminated.

gboolean utf8_is_valid_string ( const gchar * src )

Returns:
amount of UTF-8 chars when the given string forms a valid UTF-8 string, 0 meaning the string is not valid UTF-8.

Parameters:

src a NUL-terminated string.

gchar* utf8_normalize ( const gchar * src,

uni_norm_t norm

)

Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string.

Parameters:

src the string to normalize, must be valid UTF-8.

norm one of UNI_NORM_NFC, UNI_NORM_NFD, UNI_NORM_NFKC, UNI_NORM_NFKD.

Returns:
a newly allocated string

size_t utf8_strlcpy ( gchar * dst,

const gchar * src,

size_t dst_size

)

Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.

Parameters:

dst the target buffer to copy the string to.

src the source buffer to copy the string from.

dst_size the number of bytes ``dst'' can hold.

size_t utf8_strlower ( gchar * dst,

const gchar * src,

size_t size

)

Copies ``src'' to ``dst'' converting all characters to lowercase.
If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.

Parameters:

dst the target buffer

src an UTF-8 string

size the size of dst in bytes

Returns:
the length in bytes of the converted string ``src''.

gchar* utf8_strlower_copy ( const gchar * src )

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.

Parameters:

src an UTF-8 string

Returns:
a newly allocated buffer containing the lowercased string.

size_t utf8_strupper ( gchar * dst,

const gchar * src,

size_t size

)

Copies ``src'' to ``dst'' converting all characters to uppercase.
If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.

Parameters:

dst the target buffer

src an UTF-8 string

size the size of dst in bytes

Returns:
the length in bytes of the converted string ``src''.

gchar* utf8_strupper_copy ( const gchar * src )

Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.

Parameters:

src an UTF-8 string

Returns:
a newly allocated buffer containing the uppercased string.

gchar* utf8_to_filename ( const gchar * src )

Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.

Parameters:

src a NUL-terminated UTF-8 encoded string.

Returns:
a pointer to a newly allocated buffer holding the converted string.

gint utf8_to_iso8859 ( gchar * s,

gint len,

gboolean space

)

Convert UTF-8 string to ISO-8859-1 inplace.
If `space' is TRUE, all characters outside the U+0000 .. U+00FF range are turned to space U+0020. Otherwise, we stop at the first out-of-range character.
If `len' is 0, the length of the string is computed with strlen().

Returns:
length of decoded string.

gchar* utf8_to_locale ( const gchar * src )

Non-convertible characters will be replaced by '_'.
The returned string WILL be NUL-terminated in any case.
In case of an unrecoverable error, NULL is returned.

Parameters:

src a NUL-terminated string.

Returns:
a pointer to a newly allocated buffer holding the converted string.

gchar* utf8_to_ui_string ( const gchar * src )

Generated on Sun Feb 12 10:50:10 2006 for Gtk-Gnutella by Doxygen 1.3.6


Defines
#define	UNI_NORM_GUI UNI_NORM_NFC
#define	UNI_NORM_NETWORK UNI_NORM_NFC
#define	UNI_NORM_FILESYSTEM UNI_NORM_NFC
#define	UNICODE_CANONIZE(x) utf8_canonize(x)
Enumerations
enum	uni_norm_t { UNI_NORM_NFC = 0, UNI_NORM_NFKC, UNI_NORM_NFD, UNI_NORM_NFKD, NUM_UNI_NORM }
Functions
void	locale_init (void)
void	locale_close (void)
	Called at shutdown time.
const gchar *	locale_get_charset (void)
	NOTE: The internal variable "charset" can be used to override the initially detected character set name.
const gchar *	locale_get_language (void)
	Determine the current language.
guint	utf8_char_len (const gchar *s)
	Are the first bytes of string `s' forming a valid UTF-8 character?
gboolean	is_ascii_string (const gchar *str)
gboolean	utf8_is_valid_string (const gchar *s)
gboolean	utf8_is_valid_data (const gchar *s, size_t n)
size_t	utf8_char_count (const gchar *s)
size_t	utf8_strlcpy (gchar *dst, const gchar *src, size_t dst_size)
	Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.
guint32	utf8_decode_char_fast (const gchar *s, guint *retlen)
	This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast.
gint	utf8_to_iso8859 (gchar *s, gint len, gboolean space)
	Convert UTF-8 string to ISO-8859-1 inplace.
size_t	utf8_strlower (gchar *dst, const gchar *src, size_t size)
	Copies ``src'' to ``dst'' converting all characters to lowercase.
gchar *	utf8_strlower_copy (const gchar *src)
	Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.
size_t	utf8_strupper (gchar *dst, const gchar *src, size_t size)
	Copies ``src'' to ``dst'' converting all characters to uppercase.
gchar *	utf8_strupper_copy (const gchar *src)
	Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.
gchar *	utf8_canonize (const gchar *src)
	Apply the NFKD/NFC algorithm to produce normalized keywords.
gchar *	utf8_normalize (const gchar *src, uni_norm_t norm)
	Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string.
size_t	utf32_to_utf8 (const guint32 *in, gchar *out, size_t size)
guint32	utf32_lowercase (guint32 uc)
	Looks up the simple lowercase variant of an UTF-32 character.
size_t	utf8_decode_lookahead (const gchar *s, size_t len)
	This is a highly specialized function (read: don't use it if you don't understand what it does and how it's used) to be used with utf8_decode_char().
guint32	utf16_encode_char_compact (guint32 uc)
	Encodes a single UTF-32 character as UTF-16 and returns the result compacted into a 32-bit integer.
const gchar *	lazy_iso8859_1_to_utf8 (const gchar *src)
	Lazy converters either return a pointer to a static buffer or manage the allocated memory themselves.
const gchar *	lazy_ui_string_to_utf8 (const gchar *src)
const gchar *	lazy_utf8_to_ui_string (const gchar *src)
const gchar *	lazy_utf8_to_locale (const gchar *src)
const gchar *	lazy_locale_to_utf8 (const gchar *src)
const gchar *	lazy_locale_to_ui_string (const gchar *src)
const gchar *	lazy_locale_to_ui_string2 (const gchar *src)
const gchar *	lazy_locale_to_utf8_normalized (const gchar *src, uni_norm_t norm)
const gchar *	lazy_unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr)
gchar *	iso8859_1_to_utf8 (const gchar *str)
	Converts a string from ISO-8859-1 to UTF-8 encoding.
gchar *	iso8859_1_to_utf8_normalized (const gchar *str, uni_norm_t norm)
	Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.
gchar *	utf8_to_ui_string (const gchar *src)
gchar *	ui_string_to_utf8 (const gchar *src)
gchar *	utf8_to_locale (const gchar *s)
	Non-convertible characters will be replaced by '_'.
gchar *	locale_to_utf8 (const gchar *str)
	Converts a string from the locale's character set to UTF-8 encoding.
gchar *	locale_to_utf8_normalized (const gchar *str, uni_norm_t norm)
	Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.
gchar *	utf8_to_filename (const gchar *s)
	Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.
gchar *	filename_to_utf8_normalized (const gchar *str, uni_norm_t norm)
	Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.
gchar *	unknown_to_utf8 (const gchar *str, const gchar **charset_ptr)
	Converts the string to UTF-8 assuming an appropriate character set.
gchar *	unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr)
	Converts a string from an unknown character set to UTF-8 encoding and the specified Unicode normalization form.
size_t	ascii_enforce (gchar *dst, size_t size, const gchar *src)
	Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores.
size_t	utf8_enforce (gchar *dst, size_t size, const gchar *src)
	Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores.
gboolean	icu_enabled (void)
gboolean	locale_is_latin (void)
gboolean	locale_is_utf8 (void)
gboolean	utf8_can_dejap (const gchar *src)
	Checks whether the given UTF-8 string contains any convertible characters.
size_t	utf8_dejap (gchar *dst, size_t dst_size, const gchar *src)
	Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.