/* * Copyright (C) 2001 Edmund Grimley Evans * Portions Copyright (C) 2003 by various authors * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. * * $Id: charset.c,v 1.11 2004/01/10 14:59:24 joerg78 Exp $ */ #include #include #include "cddbd.h" #include "utf8.h" void *xrealloc(void *ptr, size_t size) { void *p = realloc(ptr, size); if (!p && size) { cddbd_log(LOG_ERR, "Realloc failed."); quit(QUIT_ERR); } return p; } /* * If there is a valid UTF-8 char starting at *ps, * return its value and increment *ps. Otherwise * return -1 and leave *ps unchanged. All 2^31 * possible UCS chars are accepted. */ int parse_utf8(const char **ps) { unsigned char *s = (unsigned char *)*ps; int wc, n, i; if (*s < 0x80) { *ps = (char *)s + 1; return *s; } if (*s < 0xc2) return -1; else if (*s < 0xe0) { if ((s[1] & 0xc0) != 0x80) return -1; *ps = (char *)s + 2; return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); } else if (*s < 0xf0) n = 3; else if (*s < 0xf8) n = 4; else if (*s < 0xfc) n = 5; else if (*s < 0xfe) n = 6; else return -1; wc = *s++ & ((1 << (7 - n)) - 1); for (i = 1; i < n; i++) { if ((*s & 0xc0) != 0x80) return -1; wc = (wc << 6) | (*s++ & 0x3f); } if (wc < (1 << (5 * n - 4))) return -1; *ps = (char *)s; return wc; } void charset_latin1_utf8(const char *s, char **ps) { char *buf, *p; p = buf = xrealloc(0, strlen(s) * 2 + 1); for (; *s; s++) if ((*s & 0x80) == 0) *p++ = *s; else { *p++ = 0xc0 | ((*s >> 6) & 0x03); *p++ = 0x80 | (*s & 0x3f); } *p++ = '\0'; *ps = xrealloc(buf, p - buf); } int charset_utf8_latin1(const char *s, char **ps) { const char *t = s; int inexact = 0; char *p, *buf; int n; p = buf = xrealloc(0, strlen(s) / 2 * MAXTRANS + 2); while ((n = parse_utf8(&t))) if (n == -1) return -1; else if (n < 0x100) *p++ = n; else { int i, j; inexact = 1; i = (n >> 8) - 1; j = n & 0xff; if (i < UTF8TAB && utf8tab[i] && utf8tab[i][j]) { strcpy(p, utf8tab[i][j]); p += strlen(utf8tab[i][j]); } else *p++ = '?'; } *p++ = '\0'; *ps = xrealloc(buf, p - buf); return inexact; } int charset_is_utf8(const char *s) { const char *t = s; int n; while ((n = parse_utf8(&t))) if (n == -1) return 0; return 1; } /* * Here we define the acceptable characters in a DB entry: * - ASCII control chars: 9 (tab), 10 (lf), 13 (cr) * - all printable ASCII: 32..127 * - everything from U+00A0 upwards except: * - surrogates: U+D800..U+DFFF * - BOM and reverse BOM: U+FEFF, U+FFFE * - U+FFFF * These code points are excluded because they are likely to * arise through misconversions and may cause problems. * There are lots of other weird UCS characters which would not * be exactly welcome in a DB entry, but it's best to keep the * rule simple and consistent. */ #define GOOD_ASCII(c) \ ((32 <= (c) && (c) < 127) || \ (c) == 9 || (c) == 10 || (c) == 13) #define GOOD_UCS(c) \ (GOOD_ASCII(c) || \ ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \ (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)) #define GOOD_LATIN1(c) \ (GOOD_ASCII(c) || \ ((c) >= 160)) int charset_is_valid_utf8(const char *s) { const char *t = s; int n; while ((n = parse_utf8(&t))) { if (!GOOD_UCS(n)) return 0; } return 1; } int charset_is_valid_ascii(const char *s) { for (; *s; s++) { if (!GOOD_ASCII(*s)) return 0; } return 1; } int charset_is_valid_latin1(const char *s) { for (; *s; s++) { if (!GOOD_LATIN1((unsigned char)*s)) return 0; } return 1; }