This code has been heavily inspired by utf8.c/utf8.h from Perl 5.6.1, written by Larry Wall et al.
#include "common.h"
#include <locale.h>
#include <iconv.h>
#include "utf8_tables.h"
#include "utf8.h"
#include "misc.h"
#include "glib-missing.h"
#include "override.h"
Data Structures | |
struct | conv_to_utf8 |
Defines | |
#define | D(name, id) (iconv_t) -1, (name), (id), FALSE |
#define | UTF8_LENGTH_MARK(len) utf8len_mark[len] |
#define | CHAR(x) ((guchar) (x)) |
#define | UTF8_BYTE_MARK 0x80 |
#define | UTF8_BYTE_MASK 0xbf |
#define | UTF8_IS_ASCII(x) (CHAR(x) < UTF8_BYTE_MARK) |
#define | UTF8_IS_START(x) (CHAR(x) >= 0xc0 && CHAR(x) <= 0xfd) |
#define | UTF8_IS_CONTINUATION(x) (CHAR(x) >= UTF8_BYTE_MARK && CHAR(x) <= UTF8_BYTE_MASK) |
#define | UTF8_IS_CONTINUED(x) (CHAR(x) & UTF8_BYTE_MARK) |
#define | UTF8_CONT_MASK (CHAR(0x3f)) |
#define | UTF8_ACCU_SHIFT 6 |
#define | UTF8_ACCUMULATE(o, n) (((o) << UTF8_ACCU_SHIFT) | (CHAR(n) & UTF8_CONT_MASK)) |
#define | UNI_SURROGATE_FIRST 0xd800 |
#define | UNI_SURROGATE_SECOND 0xdc00 |
#define | UNI_SURROGATE_LAST 0xdfff |
#define | UNI_HANGUL_FIRST 0xac00 |
#define | UNI_HANGUL_LAST 0xd7a3 |
#define | UNI_REPLACEMENT 0xfffd |
#define | UNI_BYTE_ORDER_MARK 0xfffe |
#define | UNI_ILLEGAL 0xffff |
#define | UNICODE_IS_SURROGATE(x) ((x) >= UNI_SURROGATE_FIRST && (x) <= UNI_SURROGATE_LAST) |
#define | UNICODE_IS_HANGUL(x) ((x) >= UNI_HANGUL_FIRST && (x) <= UNI_HANGUL_LAST) |
#define | UNICODE_IS_ASCII(x) ((x) < 0x0080U) |
#define | UNICODE_IS_REPLACEMENT(x) ((x) == UNI_REPLACEMENT) |
#define | UNICODE_IS_BYTE_ORDER_MARK(x) ((0xFFFFU & (x)) == UNI_BYTE_ORDER_MARK) |
#define | UNICODE_IS_BOM(x) UNICODE_IS_BYTE_ORDER_MARK(x) |
#define | UNICODE_IS_ILLEGAL(x) ((x) > 0x10FFFFU || (UNI_ILLEGAL & (x)) == UNI_ILLEGAL) |
#define | GET_ITEM(i) (utf32_comb_class_lut[(i)].uc) |
#define | FOUND(i) |
#define | GET_ITEM(i) (i) |
#define | FOUND(i) |
#define | GET_ITEM(i) (utf32_composition_exclusions[(i)]) |
#define | FOUND(i) |
#define | GET_ITEM(i) (i) |
#define | FOUND(i) |
#define | GET_ITEM(i) (i) |
#define | FOUND(i) |
#define | UTF8_WARN_EMPTY 0 |
#define | UTF8_WARN_CONTINUATION 1 |
#define | UTF8_WARN_NON_CONTINUATION 2 |
#define | UTF8_WARN_FE_FF 3 |
#define | UTF8_WARN_SHORT 4 |
#define | UTF8_WARN_OVERFLOW 5 |
#define | UTF8_WARN_SURROGATE 6 |
#define | UTF8_WARN_BOM 7 |
#define | UTF8_WARN_LONG 8 |
#define | UTF8_WARN_ILLEGAL 9 |
#define | IS_NON_NUL_ASCII(p) (!(*(p) & ~0x7f) && (*(p) > 0)) |
#define | LAZY_CONVERT(func, proto, params) |
This macro is used to generate "lazy" variants of the converter functions. | |
#define | GET_ITEM(i) (utf32_nfkd_lut[(i)].c & ~UTF32_F_MASK) |
#define | FOUND(i) |
#define | GET_ITEM(i) (utf32_uppercase_lut[(i)].lower) |
#define | FOUND(i) |
#define | GET_ITEM(i) (utf32_lowercase_lut[(i)].upper) |
#define | FOUND(i) |
#define | T_COUNT 28 |
#define | V_COUNT 21 |
#define | N_COUNT (T_COUNT * V_COUNT) |
#define | L_COUNT 19 |
#define | T_COUNT 28 |
#define | V_COUNT 21 |
#define | N_COUNT (T_COUNT * V_COUNT) |
#define | S_COUNT (L_COUNT * N_COUNT) |
#define | GET_ITEM(i) (jap_tab[(i)].uc) |
#define | FOUND(i) |
#define | REGRESSION(func) |
Typedefs | |
typedef guint32(* | utf32_remap_func )(guint32 uc) |
Enumerations | |
enum | utf8_cd { UTF8_CD_ISO8859_1, UTF8_CD_ISO8859_6, UTF8_CD_ISO8859_7, UTF8_CD_ISO8859_8, UTF8_CD_SJIS, UTF8_CD_EUC_JP, UTF8_CD_KOI8_R, NUM_UTF8_CDS, UTF8_CD_INVALID = -1 } |
Functions | |
RCSID ("$Id:utf8.c, v 1.96 2006/02/02 23:58:20 cbiere Exp $") | |
void | unicode_compose_init (void) |
void | regression_checks (void) |
size_t | utf8_decompose_nfd (const gchar *in, gchar *out, size_t size) |
Decomposes (NFD) an UTF-8 encoded string. | |
size_t | utf8_decompose_nfkd (const gchar *in, gchar *out, size_t size) |
Decomposes (NFKD) an UTF-8 encoded string. | |
size_t | utf32_strmaxlen (const guint32 *s, size_t maxlen) |
Determines the length of a UTF-32 string inspecting at most ``maxlen'' characters (not bytes!). | |
size_t | utf32_to_utf8 (const guint32 *in, gchar *out, size_t size) |
Converts a UTF-32 encoded string to a UTF-8 encoded string. | |
size_t | utf32_strlen (const guint32 *s) |
Determines the length of an UTF-32 string. | |
enum utf8_cd | utf8_name_to_cd (const gchar *name) |
Looks up a "to UTF-8" converter by source charset name. | |
const gchar * | utf8_cd_to_name (enum utf8_cd id) |
Determine the name of the source charset of a converter. | |
iconv_t | utf8_cd_get (enum utf8_cd id) |
Get the iconv() conversion descriptor of a converter. | |
gboolean | locale_is_utf8 (void) |
const gchar * | primary_filename_charset (void) |
gboolean | primary_filename_charset_is_utf8 (void) |
G_GNUC_CONST guint | utf8_skip (guchar c) |
G_GNUC_CONST guint | uniskip (guint32 uc) |
Determines the UTF-8 byte length for the given Unicode codepoint. | |
G_GNUC_CONST gboolean | utf32_bad_codepoint (guint32 uc) |
Determines whether the given UTF-32 codepoint is valid in Unicode. | |
guint | utf8_encoded_char_len (guint32 uc) |
guint | utf8_encode_char (guint32 uc, gchar *buf, size_t size) |
Needs short description here. | |
guint | utf32_combining_class (guint32 uc) |
gint | block_id_cmp (size_t i, guint32 uc) |
guint | utf32_block_id (guint32 uc) |
gboolean | utf32_composition_exclude (guint32 uc) |
gboolean | utf32_is_non_character (guint32 uc) |
Checks whether the character is a non-character which is not the same as an unassigned character. | |
gint | general_category_cmp (size_t i, guint32 uc) |
uni_gc_t | utf32_general_category (guint32 uc) |
gint | normalization_special_cmp (size_t i, guint32 uc) |
gboolean | utf32_is_normalization_special (guint32 uc) |
guint32 | utf8_decode_char (const gchar *s, gint len, guint *retlen, gboolean warn) |
guint32 | utf8_decode_char_fast (const gchar *s, guint *retlen) |
This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast. | |
guint | utf8_char_len (const gchar *s) |
Are the first bytes of string `s' forming a valid UTF-8 character? | |
gboolean | utf8_is_valid_string (const gchar *src) |
gboolean | utf8_is_valid_data (const gchar *src, size_t len) |
size_t | utf8_char_count (const gchar *src) |
size_t | utf8_data_char_count (const gchar *src, size_t len) |
size_t | utf8_strlcpy (gchar *dst, const gchar *src, size_t dst_size) |
Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated. | |
gint | utf16_encode_char (guint32 uc, guint16 *dst) |
Encodes a single UTF-32 character as UTF-16 into a buffer. | |
gint | utf8_to_iso8859 (gchar *s, gint len, gboolean space) |
Convert UTF-8 string to ISO-8859-1 inplace. | |
const gchar * | get_iconv_charset_alias (const gchar *cs) |
const gchar * | locale_get_charset (void) |
NOTE: The internal variable "charset" can be used to override the initially detected character set name. | |
const gchar * | locale_get_language (void) |
Determine the current language. | |
conv_to_utf8 * | conv_to_utf8_new (const gchar *cs) |
GSList * | get_filename_charsets (const gchar *locale) |
Emulate GLib 2.x behaviour and select the appropriate character set for filenames. | |
void | textdomain_init (const char *codeset) |
void | locale_init_show_results (void) |
void | conversion_init (void) |
void | locale_init (void) |
void | locale_close (void) |
Called at shutdown time. | |
size_t | complete_iconv (iconv_t cd, gchar *dst, size_t dst_left, const gchar *src, gboolean abort_on_error) |
Converts the string in "src" into the buffer "dst" using the iconv context "cd". | |
gchar * | hyper_iconv (iconv_t cd, gchar *dst, size_t dst_size, const gchar *src, gboolean abort_on_error) |
Converts the string in "src" to "dst" using the iconv context "cd". | |
size_t | utf8_enforce (gchar *dst, size_t size, const gchar *src) |
Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores. | |
size_t | ascii_enforce (gchar *dst, size_t size, const gchar *src) |
Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores. | |
gchar * | hyper_utf8_enforce (gchar *dst, size_t dst_size, const gchar *src) |
gchar * | hyper_ascii_enforce (gchar *dst, size_t dst_size, const gchar *src) |
gchar * | utf8_to_filename_charset (const gchar *src) |
Non-convertible characters will be replaced by '_'. | |
gchar * | utf8_to_filename (const gchar *src) |
Converts the UTF-8 encoded src string to a string encoded in the primary filename character set. | |
gchar * | utf8_to_locale (const gchar *src) |
Non-convertible characters will be replaced by '_'. | |
gchar * | convert_to_utf8 (iconv_t cd, const gchar *src) |
gchar * | locale_to_utf8 (const gchar *src) |
Converts a string from the locale's character set to UTF-8 encoding. | |
gchar * | iso8859_1_to_utf8 (const gchar *src) |
Converts a string from ISO-8859-1 to UTF-8 encoding. | |
gboolean | is_ascii_string (const gchar *s) |
const gchar * | ascii_rewind (const gchar *const s0, const gchar *p) |
gboolean | koi8_is_cyrillic_char (guchar c) |
gboolean | looks_like_koi8 (const gchar *src) |
gboolean | iso8859_is_valid_char (guchar c) |
gboolean | iso8859_6_is_arabic_char (guchar c) |
gboolean | iso8859_6_is_valid_char (guchar c) |
gboolean | looks_like_iso8859_6 (const gchar *src) |
gboolean | iso8859_7_is_greek_char (guchar c) |
gboolean | looks_like_iso8859_7 (const gchar *src) |
gboolean | iso8859_8_is_hebrew_char (guchar c) |
gboolean | iso8859_8_is_valid_char (guchar c) |
gboolean | looks_like_iso8859_8 (const gchar *src) |
gboolean | looks_like_sjis (const gchar *src) |
Matches SJIS encoded strings. | |
gboolean | iso8859_is_valid_string (const gchar *src) |
gchar * | unknown_to_utf8 (const gchar *src, const gchar **charset_ptr) |
Converts the string to UTF-8 assuming an appropriate character set. | |
gchar * | convert_to_utf8_normalized (iconv_t cd, const gchar *src, uni_norm_t norm) |
gchar * | locale_to_utf8_normalized (const gchar *src, uni_norm_t norm) |
Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form. | |
gchar * | filename_to_utf8_normalized (const gchar *src, uni_norm_t norm) |
Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form. | |
gchar * | iso8859_1_to_utf8_normalized (const gchar *src, uni_norm_t norm) |
Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form. | |
gchar * | unknown_to_utf8_normalized (const gchar *src, uni_norm_t norm, const gchar **charset_ptr) |
Converts a string to UTF-8 encoding, assuming an appropriate character set, and to the specified Unicode normalization form. | |
gchar * | utf8_to_ui_string (const gchar *src) |
gchar * | ui_string_to_utf8 (const gchar *src) |
gchar * | locale_to_ui_string (const gchar *src) |
gchar * | locale_to_ui_string2 (const gchar *src) |
LAZY_CONVERT (locale_to_utf8_normalized,(const gchar *src, uni_norm_t norm),(src, norm)) | |
size_t | utf32_to_utf8_inplace (guint32 *buf) |
Converts a UTF-32 encoded string to a UTF-8 encoded string. | |
guint32 * | utf32_strdup (const guint32 *s) |
The equivalent of g_strdup() for UTF-32 strings. | |
gint64 | utf32_strcmp (const guint32 *s1, const guint32 *s2) |
const guint32 * | utf32_decompose_lookup (guint32 uc, gboolean nfkd) |
Looks up the decomposed string for an UTF-32 character. | |
guint32 | utf32_uppercase (guint32 uc) |
Looks up the simple uppercase variant of an UTF-32 character. | |
guint32 | utf32_lowercase (guint32 uc) |
Looks up the simple lowercase variant of an UTF-32 character. | |
guint32 | utf32_compose_char (guint32 a, guint32 b) |
Finds the composition of two UTF-32 characters. | |
guint32 * | utf32_next_starter (const guint32 *s) |
Finds the next ``starter'' character (combining class zero) in the string starting at ``s''. | |
gboolean | utf32_canonical_sorted (const guint32 *src) |
Checks whether an UTF-32 string is in canonical order. | |
gboolean | utf32_is_decomposed_char (guint32 uc, gboolean nfkd) |
gboolean | utf32_is_decomposed (const guint32 *src, gboolean nfkd) |
Checks whether an UTF-32 string is decomposed. | |
guint32 * | utf32_sort_canonical (guint32 *src) |
Puts an UTF-32 string into canonical order. | |
gboolean | utf8_is_decomposed (const gchar *src, gboolean nfkd) |
Checks whether an UTF-8 encoded string is decomposed. | |
gboolean | utf8_canonical_sorted (const gchar *src) |
Checks whether an UTF-8 encoded string is in canonical order. | |
gchar * | utf8_sort_canonical (gchar *src) |
Puts an UTF-8 encoded string into canonical order. | |
guint | utf32_decompose_hangul_char (guint32 uc, guint32 *buf) |
Decomposes a Hangul character. | |
size_t | utf32_compose_hangul (guint32 *src) |
Composes all Hangul characters in a string. | |
const guint32 * | utf32_decompose_single_char (guint32 uc, size_t *len, gboolean nfkd) |
Decomposes a single UTF-32 character. | |
const guint32 * | utf32_decompose_char (guint32 uc, size_t *len, gboolean nfkd) |
Decomposes an UTF-32 character completely. | |
size_t | utf8_decompose (const gchar *src, gchar *out, size_t size, gboolean nfkd) |
Decomposes an UTF-8 encoded string. | |
size_t | utf32_decompose (const guint32 *in, guint32 *out, size_t size, gboolean nfkd) |
Decomposes an UTF-32 encoded string. | |
size_t | utf32_decompose_nfd (const guint32 *in, guint32 *out, size_t size) |
Decomposes (NFD) an UTF-32 encoded string. | |
size_t | utf32_decompose_nfkd (const guint32 *in, guint32 *out, size_t size) |
Decomposes (NFKD) an UTF-32 encoded string. | |
size_t | utf8_remap (gchar *dst, const gchar *src, size_t size, utf32_remap_func remap) |
Copies the UTF-8 string ``src'' to ``dst'' remapping all characters using ``remap''. | |
size_t | utf32_remap (guint32 *dst, const guint32 *src, size_t size, utf32_remap_func remap) |
Copies the UTF-32 string ``src'' to ``dst'' remapping all characters using ``remap''. | |
size_t | utf32_strlower (guint32 *dst, const guint32 *src, size_t size) |
Copies ``src'' to ``dst'' converting all characters to lowercase. | |
size_t | utf32_strupper (guint32 *dst, const guint32 *src, size_t size) |
Copies ``src'' to ``dst'' converting all characters to uppercase. | |
size_t | utf8_strlower (gchar *dst, const gchar *src, size_t size) |
Copies ``src'' to ``dst'' converting all characters to lowercase. | |
size_t | utf8_strupper (gchar *dst, const gchar *src, size_t size) |
Copies ``src'' to ``dst'' converting all characters to uppercase. | |
gchar * | utf8_strlower_copy (const gchar *src) |
Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase. | |
gchar * | utf8_strupper_copy (const gchar *src) |
Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase. | |
guint32 | utf32_filter_char (guint32 uc, gboolean *space, gboolean last) |
Filters characters that are ignorable for query strings. | |
size_t | utf32_filter (const guint32 *src, guint32 *dst, size_t size) |
Removes all non-letter and non-digit characters by looking at the Unicode symbol type; all other characters are reduced to a normal space. Tries to merge consecutive spaces and, at the same time, keeps the important non-spacing marks. | |
size_t | utf32_split_blocks (const guint32 *src, guint32 *dst, size_t size) |
Copies the NUL-terminated UTF-32 string ``src'' to ``dst'' inserting an ASCII whitespace (U+0020) at every Unicode block change. | |
gboolean | icu_enabled (void) |
gboolean | locale_is_latin (void) |
size_t | utf32_compose (guint32 *src) |
Composes an UTF-32 encoded string in-place. | |
guint32 * | utf32_normalize (const guint32 *src, uni_norm_t norm) |
gchar * | utf8_normalize (const gchar *src, uni_norm_t norm) |
Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string. | |
guint32 * | utf32_canonize (const guint32 *src) |
Apply the NFKD/NFC algorithm to obtain normalized keywords. | |
gchar * | utf8_canonize (const gchar *src) |
Apply the NFKD/NFC algorithm to obtain normalized keywords. | |
int | compose_root_cmp (gconstpointer a, gconstpointer b) |
Helper function to sort the lists of ``utf32_compose_roots''. | |
void | unicode_compose_add (guint idx) |
This is a helper for unicode_compose_init() to create the lookup table used by utf32_compose_char(). | |
const gchar * | utf8_dejap_char (const guint32 uc) |
gboolean | utf8_can_dejap (const gchar *src) |
Checks whether the given UTF-8 string contains any convertible characters. | |
size_t | utf8_dejap (gchar *dst, size_t dst_size, const gchar *src) |
Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is. | |
void | regression_normalization_character_identity (void) |
Checks that the following holds except for the characters that appear in column 1 in Part 1 of NormalizationTest.txt: X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X). | |
void | regression_normalization_issue (void) |
See: http://www.unicode.org/review/pr-29.html. | |
void | regression_utf8_strlower (void) |
void | regression_bug_1211413 (void) |
The following code is supposed to reproduce bug #1211413. | |
void | regression_iconv_utf8_to_utf8 (void) |
Some iconv()s let invalid UTF-8 with codepoints beyond U+10FFFF slip through, when converting from UTF-8 to UTF-8. | |
void | regression_utf8_bijection (void) |
void | utf8_regression_checks (void) |
Variables | |
guint32 | common_dbg = 0 |
XXX -- need to init lib's props --RAM. | |
gboolean | unicode_compose_init_passed |
gboolean | locale_init_passed |
gboolean | use_icu = FALSE |
use_icu is set to TRUE if the initialization of ICU succeeded. | |
gboolean | latin_locale = FALSE |
Used by locale_is_latin(). | |
const gchar * | charset = NULL |
GSList * | sl_filename_charsets = NULL |
A single-linked list of conv_to_utf8 structs. | |
iconv_t | cd_locale_to_utf8 = (iconv_t) -1 |
iconv_t | cd_utf8_to_locale = (iconv_t) -1 |
Mainly used for Gtk+ 1.2. | |
iconv_t | cd_utf8_to_filename = (iconv_t) -1 |
Mainly used for Gtk+ 1.2. | |
struct { | |
iconv_t cd | |
const gchar * name | |
const enum utf8_cd id | |
gboolean initialized | |
} | utf8_cd_tab [] |
const guint8 | utf8len_mark [] |
const char * | codesets [] |
GHashTable * | utf32_compose_roots |
|
|
|
|
|
Value: G_STMT_START { \ return jap_tab[(i)].s; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return utf32_lowercase_lut[(i)].lower; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return utf32_uppercase_lut[(i)].upper; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return utf32_nfkd_lut[(i)].c & (nfkd ? 0 : UTF32_F_NFKD) \ ? NULL \ : utf32_nfkd_lut[(i)].d; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return TRUE; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return utf32_general_category_lut[(i)].gc; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return TRUE; \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return 1 + (i); \ /* NOTREACHED */ \ } G_STMT_END |
|
Value: G_STMT_START { \ return utf32_comb_class_lut[(i)].cc; \ /* NOTREACHED */ \ } G_STMT_END |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Value: const gchar * \ CAT2(lazy_,func) proto \ { \ static gchar *prev; /* Previous conversion result */ \ gchar *dst; \ \ g_assert(src); \ g_assert(prev != src); \ \ G_FREE_NULL(prev); \ \ dst = func params; \ if (dst != src) \ prev = dst; \ return dst; \ } In this context "lazy" means that the function will either return the original string (if appropriate) or a newly allocated string but the newly allocated string MUST NOT be freed. Instead the memory will be released when the function is used again. Thus the handling is similar to that of functions which return static buffers except that the functions are not limited to a fixed buffer size. The return type has a const qualifier so that a blatant attempt to free the memory is usually caught at compile time. If the result is not the original string, it MUST NOT be passed as parameter to this function. The last allocated buffer will normally be leaked at exit time. However, if you pass an empty string, the last allocated buffer is released and the empty string itself is returned. This is not strictly necessary but it may be used to get rid of useless warnings about a "memory leak" or to keep the memory foot-print lower. |
|
|
|
|
|
Value: G_STMT_START { \ printf("REGRESSION: regression_%s", STRINGIFY(func)); \ fflush(stdout); \ CAT2(regression_,func)(); \ printf(" PASSED\n"); \ } G_STMT_END |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-ASCII) with underscores. ``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated.
TODO: Add overlap check |
|
|
|
|
|
Converts the string in "src" into the buffer "dst" using the iconv context "cd". If "dst_size" is too small, the resulting string will be truncated. complete_iconv() returns the necessary buffer size. IFF "dst_size" is zero, "dst" may be NULL.
|
|
Helper function to sort the lists of ``utf32_compose_roots''.
|
|
|
|
|
|
|
|
|
|
Converts a string from the filename character set to UTF-8 encoding and the specified Unicode normalization form.
|
|
|
|
Emulate GLib 2.x behaviour and select the appropriate character set for filenames.
|
|
|
|
|
|
Converts the string in "src" to "dst" using the iconv context "cd". If complete_iconv() fails, NULL is returned. Otherwise, the converted string is returned. If "dst" was sufficiently large, "dst" will be returned. If not, a newly allocated string is returned. In the latter case complete_iconv() has to run twice. IFF dst_size is zero, "dst" won't be touched and may be NULL. For best performance a small local buffer should be used as "dst" so that complete_iconv() does not have to run twice, especially if the result is only used temporarily and copying is not necessary.
|
|
|
|
|
|
|
|
Converts a string from ISO-8859-1 to UTF-8 encoding. The returned string is in no defined Unicode normalization form.
|
|
Converts a string from the ISO-8859-1 character set to UTF-8 encoding and the specified Unicode normalization form.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Called at shutdown time.
|
|
NOTE: The internal variable "charset" can be used to override the initially detected character set name.
|
|
Determine the current language.
TRANSLATORS: Put the two-letter ISO 639 code here. |
|
|
|
|
|
|
|
|
|
|
|
|
|
Converts a string from the locale's character set to UTF-8 encoding. The returned string is in no defined Unicode normalization form.
|
|
Converts a string from the locale's character set to UTF-8 encoding and the specified Unicode normalization form.
|
|
|
|
|
|
|
|
|
|
Matches SJIS encoded strings.
|
|
|
|
|
|
|
|
|
|
The following code is supposed to reproduce bug #1211413.
|
|
|
|
Some iconv()s let invalid UTF-8 with codepoints beyond U+10FFFF slip through, when converting from UTF-8 to UTF-8. Thus, use utf8_enforce() for UTF-8 -> UTF-8 instead. |
|
Checks that the following holds except for the characters that appear in column 1 in Part 1 of NormalizationTest.txt: X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X) |
|
See: http://www.unicode.org/review/pr-29.html.
|
|
|
|
|
|
|
|
|
|
This is a helper for unicode_compose_init() to create the lookup table used by utf32_compose_char(). The first character of the decomposition sequence is used as key, the index into the ``utf32_nfkd_lut'' is used as value. |
|
|
|
Determines the UTF-8 byte length for the given Unicode codepoint.
|
|
Converts the string to UTF-8 assuming an appropriate character set. The conversion result might still be rubbish but is guaranteed to be UTF-8 encoded. The returned string is in no defined Unicode normalization form.
|
|
Converts a string to UTF-8 encoding, assuming an appropriate character set, and to the specified Unicode normalization form.
|
|
Encodes a single UTF-32 character as UTF-16 into a buffer. See also RFC 2781.
|
|
Determines whether the given UTF-32 codepoint is valid in Unicode.
|
|
|
|
Checks whether an UTF-32 string is in canonical order.
|
|
Apply the NFKD/NFC algorithm to obtain normalized keywords.
|
|
|
|
Composes an UTF-32 encoded string in-place. The modified string might be shorter but is never longer than the original string. NB: We assume that a direct composition eliminates at most one character. Further, the string must be in canonical order.
|
|
Finds the composition of two UTF-32 characters.
|
|
Composes all Hangul characters in a string.
|
|
|
|
Decomposes an UTF-32 encoded string.
|
|
Decomposes an UTF-32 character completely.
|
|
Decomposes a Hangul character.
|
|
Looks up the decomposed string for an UTF-32 character.
|
|
Decomposes (NFD) an UTF-32 encoded string.
|
|
Decomposes (NFKD) an UTF-32 encoded string.
|
|
Decomposes a single UTF-32 character. This must be used iteratively to gain the complete decomposition.
|
|
Removes all non-letter and non-digit characters by looking at the Unicode symbol type; all other characters are reduced to a normal space. Tries to merge consecutive spaces and, at the same time, keeps the important non-spacing marks.
|
|
Filters characters that are ignorable for query strings. *space should be initialized to TRUE for the first character of a string. ``space'' is used to prevent adding multiple space characters i.e., a space should not be followed by a space.
|
|
|
|
Checks whether an UTF-32 string is decomposed.
|
|
|
|
Checks whether the character is a non-character which is not the same as an unassigned character.
|
|
|
|
Looks up the simple lowercase variant of an UTF-32 character.
|
|
Finds the next ``starter'' character (combining class zero) in the string starting at ``s''. Note that NUL is also a ``starter''.
|
|
|
|
Copies the UTF-32 string ``src'' to ``dst'' remapping all characters using ``remap''. If the created string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-32 encoded, otherwise the string will be truncated.
|
|
Puts an UTF-32 string into canonical order.
|
|
Copies the NUL-terminated UTF-32 string ``src'' to ``dst'' inserting an ASCII whitespace (U+0020) at every Unicode block change. If the block change is caused by such an ASCII whitespace itself, no additional space is inserted.
|
|
|
|
The equivalent of g_strdup() for UTF-32 strings.
|
|
Determines the length of an UTF-32 string.
|
|
Copies ``src'' to ``dst'' converting all characters to lowercase. If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter.
|
|
Determines the length of a UTF-32 string inspecting at most ``maxlen'' characters (not bytes!). This can safely be used with unterminated UTF-32 strings if ``maxlen'' has an appropriate value. To detect whether the actual string is longer than ``maxlen'' characters, just check if ``string[maxlen]'' is 0x0000, if and only if the returned value equals maxlen. Otherwise, the returned value is indeed the complete length of the UTF-32 string.
|
|
Copies ``src'' to ``dst'' converting all characters to uppercase. If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter.
|
|
Converts a UTF-32 encoded string to a UTF-8 encoded string. The target string ``out'' is always zero-terminated unless ``size'' is zero.
|
|
Converts a UTF-32 encoded string to a UTF-8 encoded string. The target string ``out'' is always zero-terminated unless ``size'' is zero.
|
|
Looks up the simple uppercase variant of an UTF-32 character.
|
|
Checks whether the given UTF-8 string contains any convertible characters.
|
|
Checks whether an UTF-8 encoded string is in canonical order.
|
|
Apply the NFKD/NFC algorithm to obtain normalized keywords.
|
|
Get the iconv() conversion descriptor of a converter.
|
|
Determine the name of the source charset of a converter.
|
|
|
|
Are the first bytes of string `s' forming a valid UTF-8 character?
|
|
|
|
|
|
This routine is the same as utf8_decode_char() but it is more specialized and is aimed at being fast. Use it when you don't need warnings and you don't know the length of the string you're reading from.
|
|
Decomposes an UTF-8 encoded string. The UTF-8 string written to ``out'' is always NUL-terminated unless ``size'' is zero. If the size of ``out'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.
|
|
Decomposes (NFD) an UTF-8 encoded string. The UTF-8 string written to ``out'' is always NUL-terminated unless ``size'' is zero. If the size of ``out'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.
|
|
Decomposes (NFKD) an UTF-8 encoded string. The UTF-8 string written to ``out'' is always NUL-terminated unless ``size'' is zero. If the size of ``out'' is too small to hold the complete decomposed string, the resulting string will be truncated but the validity of the UTF-8 encoding will be preserved. Truncation is indicated by the return value being equal to or greater than ``size''.
|
|
Converts hiragana and katakana characters to ASCII sequences, strips voice marks and keeps any other characters as is.
|
|
|
|
Needs short description here.
|
|
|
|
Copies the NUL-terminated string ``src'' to ``dst'' replacing all invalid characters (non-UTF-8) with underscores. ``src'' and ``dst'' may be identical but must not overlap otherwise. If ``dst'' is too small, the resulting string will be truncated but the UTF-8 encoding is preserved in any case.
TODO: Add overlap check |
|
Checks whether an UTF-8 encoded string is decomposed.
|
|
|
|
|
|
Looks up a "to UTF-8" converter by source charset name.
|
|
Normalizes an UTF-8 string to the requested normal form and returns it as a newly allocated string.
|
|
|
|
Copies the UTF-8 string ``src'' to ``dst'' remapping all characters using ``remap''. If the created string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.
|
|
|
|
Puts an UTF-8 encoded string into canonical order.
|
|
Works exactly like strlcpy() but preserves a valid UTF-8 encoding, if the string has to be truncated.
|
|
Copies ``src'' to ``dst'' converting all characters to lowercase. If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.
|
|
Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to lowercase.
|
|
Copies ``src'' to ``dst'' converting all characters to uppercase. If the string is as long as ``size'' or larger, the string in ``dst'' will be truncated. ``dst'' is always NUL-terminated unless ``size'' is zero. The returned value is the length of the converted string ``src'' regardless of the ``size'' parameter. ``src'' must be validly UTF-8 encoded, otherwise the string will be truncated.
|
|
Copies the UTF-8 string ``src'' to a newly allocated buffer converting all characters to uppercase.
|
|
Converts the UTF-8 encoded src string to a string encoded in the primary filename character set.
|
|
Non-convertible characters will be replaced by '_'. The returned string WILL be NUL-terminated in any case. In case of an unrecoverable error, NULL is returned.
|
|
Convert UTF-8 string to ISO-8859-1 in place. If `space' is TRUE, all characters outside the U+0000 .. U+00FF range are turned to space U+0020. Otherwise, we stop at the first out-of-range character. If `len' is 0, the length of the string is computed with strlen().
|
|
Non-convertible characters will be replaced by '_'. The returned string WILL be NUL-terminated in any case. In case of an unrecoverable error, NULL is returned.
|
|
|
|
iconv() conversion descriptor; may be -1
|
|
|
|
Mainly used for Gtk+ 1.2.
|
|
Mainly used for Gtk+ 1.2.
|
|
|
|
|
|
XXX -- need to init lib's props --RAM. It affects only those functions that are explicitly defined to handle UI strings as input or output. This allows reducing the number of conversions. For example, if a function specification permits that the original string may be returned, we will do that instead of creating a copy. If UI_USES_UTF8_ENCODING is undefined, it is assumed that the user-interface uses the locale's encoding for its strings. |
|
Enumerated ID of the converter.
|
|
Whether initialization of "cd" was attempted.
|
|
Used by locale_is_latin(). It is initialized by locale_init(). |
|
|
|
Name of the source charset.
|
|
A single-linked list of conv_to_utf8 structs. The first one is used for converting from the primary charset. Additional charsets are optional. |
|
|
|
use_icu is set to TRUE if the initialization of ICU succeeded. If it fails, we'll fall back to the non-ICU behaviour. |
|
|
|
|
|
Initial value: { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC } |