Gtk-Gnutella: utf8.h Source File

00001 /*
00002  * $Id: utf8.h,v 1.41 2006/02/02 23:58:20 cbiere Exp $
00003  *
00004  * Copyright (c) 2002-2003, Raphael Manfredi
00005  *
00006  *----------------------------------------------------------------------
00007  * This file is part of gtk-gnutella.
00008  *
00009  *  gtk-gnutella is free software; you can redistribute it and/or modify
00010  *  it under the terms of the GNU General Public License as published by
00011  *  the Free Software Foundation; either version 2 of the License, or
00012  *  (at your option) any later version.
00013  *
00014  *  gtk-gnutella is distributed in the hope that it will be useful,
00015  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *  GNU General Public License for more details.
00018  *
00019  *  You should have received a copy of the GNU General Public License
00020  *  along with gtk-gnutella; if not, write to the Free Software
00021  *  Foundation, Inc.:
00022  *      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00023  *----------------------------------------------------------------------
00024  */
00025 
00036 #ifndef _utf8_h_
00037 #define _utf8_h_
00038 
00039 #include <glib.h>
00040 
00041 
00042 #if 0  /* xxxUSE_ICU */
00043 #include "unicode/uchar.h"
00044 #include "unicode/ustring.h"
00045 #include "unicode/utypes.h"
00046 #include "unicode/ustdio.h"
00047 #include "unicode/unorm.h"
00048 #endif
00049 
/**
 * Selector for a Unicode normalization form.
 *
 * The values are consecutive and start at zero, so NUM_UNI_NORM (which must
 * stay last) equals the number of real forms listed before it.
 */
typedef enum {
    UNI_NORM_NFC  = 0,
    UNI_NORM_NFKC = 1,
    UNI_NORM_NFD  = 2,
    UNI_NORM_NFKD = 3,

    NUM_UNI_NORM  = 4   /* count of the forms above, not a form itself */
} uni_norm_t;
00058 
/*
 * Normalization form for text handed to the GUI: Gtk+ renderers want
 * UTF-8 in NFC.
 */
#define UNI_NORM_GUI UNI_NORM_NFC

/*
 * Normalization form for text passed over the wire: NFC is denser than NFD,
 * which makes it the form of choice here.
 */
#define UNI_NORM_NETWORK UNI_NORM_NFC

/*
 * Normalization form for UTF-8 filenames, chosen per platform.
 */
#if !(defined(__APPLE__) && defined(__MACH__)) /* anything but Darwin */
/* Unix systems usually use NFC for UTF-8 filenames */
#define UNI_NORM_FILESYSTEM UNI_NORM_NFC
#else /* Darwin */
/* Mac OS X (Darwin) wants filenames always in UTF-8 NFD */
#define UNI_NORM_FILESYSTEM UNI_NORM_NFD
#endif /* !Darwin */
00077 
00078 void locale_init(void);
00079 void locale_close(void);
00080 const gchar *locale_get_charset(void);
00081 const gchar * locale_get_language(void);
00082 guint utf8_char_len(const gchar *s);
00083 gboolean is_ascii_string(const gchar *str);
00084 gboolean utf8_is_valid_string(const gchar *s);
00085 gboolean utf8_is_valid_data(const gchar *s, size_t n);
00086 size_t utf8_char_count(const gchar *s);
00087 size_t utf8_strlcpy(gchar *dst, const gchar *src, size_t dst_size);
00088 guint32 utf8_decode_char_fast(const gchar *s, guint *retlen)
00089     NON_NULL_PARAM((1,2));
00090 gint utf8_to_iso8859(gchar *s, gint len, gboolean space);
00091 size_t utf8_strlower(gchar *dst, const gchar *src, size_t size);
00092 gchar *utf8_strlower_copy(const gchar *src);
00093 size_t utf8_strupper(gchar *dst, const gchar *src, size_t size);
00094 gchar *utf8_strupper_copy(const gchar *src);
00095 gchar *utf8_canonize(const gchar *src);
00096 gchar *utf8_normalize(const gchar *src, uni_norm_t norm);
00097 
00098 size_t utf32_to_utf8(const guint32 *in, gchar *out, size_t size);
00099 guint32 utf32_lowercase(guint32 uc);
00100 
/**
 * Extends a byte count forward over a NUL-terminated string, stopping at
 * the first NUL byte or once the count reaches 6, whichever comes first.
 *
 * NOTE(review): 6 is presumably the longest byte sequence the UTF-8 decoder
 * accepts -- confirm against the decoder.
 *
 * @param s   the string to inspect; bytes s[len] .. s[5] may be read.
 * @param len number of leading bytes already accounted for; scanning starts
 *            at s[len]. The bytes before that index are not examined.
 *
 * @return the extended count. When len is already 6 or more it is returned
 *         unchanged and s is not read at all.
 */
static inline size_t
utf8_decode_lookahead(const gchar *s, size_t len)
{
    size_t avail;

    for (avail = len; avail < 6; avail++) {
        if ('\0' == s[avail])
            break;
    }
    return avail;
}
00120 
/**
 * Encodes one Unicode code point as UTF-16 and packs the result into a
 * single 32-bit value.
 *
 * @param uc the code point to encode.
 *
 * @return for uc <= 0xFFFF, uc itself (a single 16-bit unit; values that
 *         fall in the surrogate range are passed through unchecked).
 *         For uc up to 0x10FFFF, a surrogate pair with the lead surrogate
 *         (0xD800..0xDBFF) in the low 16 bits and the trail surrogate
 *         (0xDC00..0xDFFF) in the high 16 bits.
 *         For anything above 0x10FFFF, (guint32) -1.
 */
static inline guint32
utf16_encode_char_compact(guint32 uc)
{
    if (uc <= 0xFFFF) {
        return uc;
    } else if (uc <= 0x10FFFF) {
        guint32 lead, trail;

        uc -= 0x10000;                  /* leaves a 20-bit value */
        lead = (uc >> 10) | 0xd800;     /* upper 10 bits */
        trail = (uc & 0x3ff) | 0xdc00;  /* lower 10 bits */

        /*
         * Both halves are held in 32-bit unsigned variables on purpose:
         * a 16-bit operand would be promoted to signed int before the
         * shift, and moving 0xDC00.. up by 16 bits does not fit in a
         * 32-bit int, which is undefined behaviour.
         */
        return (trail << 16) | lead;
    }
    return (guint32) -1;
}
00148 
00155 const gchar *lazy_iso8859_1_to_utf8(const gchar *src);
00156 
00157 const gchar *lazy_ui_string_to_utf8(const gchar *src);
00158 const gchar *lazy_utf8_to_ui_string(const gchar *src);
00159 
00160 const gchar *lazy_utf8_to_locale(const gchar *src);
00161 const gchar *lazy_locale_to_utf8(const gchar *src);
00162 
00163 const gchar *lazy_locale_to_ui_string(const gchar *src);
00164 const gchar *lazy_locale_to_ui_string2(const gchar *src);
00165 
00166 const gchar *lazy_locale_to_utf8_normalized(const gchar *src, uni_norm_t norm);
00167 const gchar *lazy_unknown_to_utf8_normalized(const gchar *src, uni_norm_t norm,
00168                 const gchar **charset_ptr);
00169 
00170 gchar *iso8859_1_to_utf8(const gchar *str);
00171 gchar *iso8859_1_to_utf8_normalized(const gchar *str, uni_norm_t norm);
00172 
00173 gchar *utf8_to_ui_string(const gchar *src);
00174 gchar *ui_string_to_utf8(const gchar *src);
00175 
00176 gchar *utf8_to_locale(const gchar *s);
00177 gchar *locale_to_utf8(const gchar *str);
00178 gchar *locale_to_utf8_normalized(const gchar *str, uni_norm_t norm);
00179 
00180 gchar *utf8_to_filename(const gchar *s);
00181 gchar *filename_to_utf8_normalized(const gchar *str, uni_norm_t norm);
00182 
00183 gchar *unknown_to_utf8(const gchar *str, const gchar **charset_ptr);
00184 gchar *unknown_to_utf8_normalized(const gchar *src, uni_norm_t norm,
00185             const gchar **charset_ptr);
00186 
00187 size_t ascii_enforce(gchar *dst, size_t size, const gchar *src);
00188 size_t utf8_enforce(gchar *dst, size_t size, const gchar *src);
00189 
00190 gboolean icu_enabled(void);
00191 gboolean locale_is_latin(void);
00192 gboolean locale_is_utf8(void);
00193 
00194 gboolean utf8_can_dejap(const gchar *src);
00195 size_t utf8_dejap(gchar *dst, size_t dst_size, const gchar *src);
00196 
00197 #if 0  /* xxxUSE_ICU */
00198 
00199 #define UNICODE_CANONIZE(x) \
00200     (icu_enabled() ? unicode_canonize(x) : utf8_canonize(x))
00201 
00202 int locale_to_icu_conv(const gchar *in, int lenin, UChar *out, int lenout);
00203 int utf8_to_icu_conv(const gchar *in, int lenin, UChar *out, int lenout);
00204 int icu_to_utf8_conv(const UChar *in, int lenin, gchar *out, int lenout);
00205 
00206 int unicode_NFC(const UChar *source, gint32 len, UChar *result, gint32 rlen);
00207 int unicode_NFKD(const UChar *source, gint32 len, UChar *result, gint32 rlen);
00208 int unicode_lower(const UChar *source, gint32 len, UChar *result, gint32 rlen);
00209 int unicode_upper(const UChar *source, gint32 len, UChar *result, gint32 rlen);
00210 int unicode_filters(const UChar *source, gint32 len, UChar *result);
00211 gchar* unicode_canonize(const gchar *in);
00212 
00213 #else /* !xxxUSE_ICU */
00214 
00215 #define UNICODE_CANONIZE(x) utf8_canonize(x)
00216 
00217 #endif  /* xxxUSE_ICU */
00218 
00219 #endif  /* _utf8_h_ */
00220 
00221 /* vi: set sw=4 ts=4 cindent: */