341 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			341 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* unicode.c - functions to convert unicode characters */
 | |
| 
 | |
| /* Copyright (C) 2010-2012 Free Software Foundation, Inc.
 | |
| 
 | |
|    This file is part of GNU Bash, the Bourne Again SHell.
 | |
| 
 | |
|    Bash is free software: you can redistribute it and/or modify
 | |
|    it under the terms of the GNU General Public License as published by
 | |
|    the Free Software Foundation, either version 3 of the License, or
 | |
|    (at your option) any later version.
 | |
| 
 | |
|    Bash is distributed in the hope that it will be useful,
 | |
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|    GNU General Public License for more details.
 | |
| 
 | |
|    You should have received a copy of the GNU General Public License
 | |
|    along with Bash.  If not, see <http://www.gnu.org/licenses/>.
 | |
| */
 | |
| 
 | |
| #include <config.h>
 | |
| 
 | |
| #if defined (HANDLE_MULTIBYTE)
 | |
| 
 | |
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <limits.h>

#if HAVE_ICONV
#  include <iconv.h>
#endif

#if HAVE_NL_LANGINFO
#  include <langinfo.h>
#endif

#include <xmalloc.h>
 | |
| 
 | |
| #ifndef USHORT_MAX
 | |
| #  ifdef USHRT_MAX
 | |
| #    define USHORT_MAX USHRT_MAX
 | |
| #  else
 | |
| #    define USHORT_MAX ((unsigned short) ~(unsigned short)0)
 | |
| #  endif
 | |
| #endif
 | |
| 
 | |
| #if !defined (STREQ)
 | |
| #  define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
 | |
| #endif /* !STREQ */
 | |
| 
 | |
| #if defined (HAVE_LOCALE_CHARSET)
 | |
| extern const char *locale_charset __P((void));
 | |
| #else
 | |
| extern char *get_locale_var __P((char *));
 | |
| #endif
 | |
| 
 | |
| static int u32init = 0;
 | |
| static int utf8locale = 0;
 | |
| #if defined (HAVE_ICONV)
 | |
| static iconv_t localconv;
 | |
| #endif
 | |
| 
 | |
| #ifndef HAVE_LOCALE_CHARSET
 | |
/* Static storage for the charset name returned by stub_charset (). */
static char charsetbuf[40];

/* Derive a charset name from the value get_locale_var () returns for
   "LC_CTYPE" and return it in the static buffer CHARSETBUF.

   - If the value is null or empty, the result is "ASCII".
   - If the value contains a '.', the result is the text after the last
     '.', cut at the first '@' if there is one.
   - Otherwise the result is the whole value.

   The copy is bounded by the size of CHARSETBUF; a longer value is
   truncated rather than written past the end of the buffer.  The returned
   pointer refers to static storage that the next call overwrites. */
static char *
stub_charset ()
{
  char *locale, *s, *t;

  locale = get_locale_var ("LC_CTYPE");
  if (locale == 0 || *locale == 0)
    {
      strcpy (charsetbuf, "ASCII");
      return charsetbuf;
    }

  s = strrchr (locale, '.');

  /* strncpy does not NUL-terminate on truncation, so terminate explicitly. */
  strncpy (charsetbuf, s ? s + 1 : locale, sizeof (charsetbuf) - 1);
  charsetbuf[sizeof (charsetbuf) - 1] = '\0';

  if (s)
    {
      /* Only a name taken from after the '.' has its '@...' suffix removed. */
      t = strchr (charsetbuf, '@');
      if (t)
	*t = 0;
    }
  return charsetbuf;
}
 | |
| #endif
 | |
| 
 | |
| void
 | |
| u32reset ()
 | |
| {
 | |
| #if defined (HAVE_ICONV)
 | |
|   if (u32init && localconv != (iconv_t)-1)
 | |
|     {
 | |
|       iconv_close (localconv);
 | |
|       localconv = (iconv_t)-1;
 | |
|     }
 | |
| #endif
 | |
|   u32init = 0;
 | |
|   utf8locale = 0;
 | |
| }
 | |
| 
 | |
| /* u32toascii ? */
 | |
| int
 | |
| u32tochar (x, s)
 | |
|      unsigned long x;
 | |
|      char *s;
 | |
| {
 | |
|   int l;
 | |
| 
 | |
|   l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
 | |
| 
 | |
|   if (x <= UCHAR_MAX)
 | |
|     s[0] = x & 0xFF;
 | |
|   else if (x <= USHORT_MAX)	/* assume unsigned short = 16 bits */
 | |
|     {
 | |
|       s[0] = (x >> 8) & 0xFF;
 | |
|       s[1] = x & 0xFF;
 | |
|     }
 | |
|   else
 | |
|     {
 | |
|       s[0] = (x >> 24) & 0xFF;
 | |
|       s[1] = (x >> 16) & 0xFF;
 | |
|       s[2] = (x >> 8) & 0xFF;
 | |
|       s[3] = x & 0xFF;
 | |
|     }
 | |
|   s[l] = '\0';
 | |
|   return l;  
 | |
| }
 | |
| 
 | |
| int
 | |
| u32tocesc (wc, s)
 | |
|      u_bits32_t wc;
 | |
|      char *s;
 | |
| {
 | |
|   int l;
 | |
| 
 | |
|   if (wc < 0x10000)
 | |
|     l = sprintf (s, "\\u%04X", wc);
 | |
|   else
 | |
|     l = sprintf (s, "\\u%08X", wc);
 | |
|   return l;
 | |
| }
 | |
| 
 | |
| /* Convert unsigned 32-bit int to utf-8 character string */
 | |
| int
 | |
| u32toutf8 (wc, s)
 | |
|      u_bits32_t wc;
 | |
|      char *s;
 | |
| {
 | |
|   int l;
 | |
| 
 | |
|   if (wc < 0x0080)
 | |
|     {
 | |
|       s[0] = (char)wc;
 | |
|       l = 1;
 | |
|     }
 | |
|   else if (wc < 0x0800)
 | |
|     {
 | |
|       s[0] = (wc >> 6) | 0xc0;
 | |
|       s[1] = (wc & 0x3f) | 0x80;
 | |
|       l = 2;
 | |
|     }
 | |
|   else if (wc < 0x10000)
 | |
|     {
 | |
|       /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */
 | |
|       s[0] = (wc >> 12) | 0xe0;
 | |
|       s[1] = ((wc >> 6) & 0x3f) | 0x80;
 | |
|       s[2] = (wc & 0x3f) | 0x80;
 | |
|       l = 3;
 | |
|     }
 | |
|   else if (wc < 0x200000)
 | |
|     {
 | |
|       s[0] = (wc >> 18) | 0xf0;
 | |
|       s[1] = ((wc >> 12) & 0x3f) | 0x80;
 | |
|       s[2] = ((wc >>  6) & 0x3f) | 0x80;
 | |
|       s[3] = (wc & 0x3f) | 0x80;
 | |
|       l = 4;
 | |
|     }
 | |
|   /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */
 | |
|   else if (wc < 0x04000000)
 | |
|     {
 | |
|       s[0] = (wc >> 24) | 0xf8;
 | |
|       s[1] = ((wc >> 18) & 0x3f) | 0x80;
 | |
|       s[2] = ((wc >> 12) & 0x3f) | 0x80;
 | |
|       s[3] = ((wc >>  6) & 0x3f) | 0x80;
 | |
|       s[4] = (wc & 0x3f) | 0x80;
 | |
|       l = 5;
 | |
|     }
 | |
|   else if (wc < 0x080000000)
 | |
|     {
 | |
|       s[0] = (wc >> 30) | 0xf8;
 | |
|       s[1] = ((wc >> 24) & 0x3f) | 0x80;
 | |
|       s[2] = ((wc >> 18) & 0x3f) | 0x80;
 | |
|       s[3] = ((wc >> 12) & 0x3f) | 0x80;
 | |
|       s[4] = ((wc >>  6) & 0x3f) | 0x80;
 | |
|       s[5] = (wc & 0x3f) | 0x80;
 | |
|       l = 6;
 | |
|     }
 | |
|   else
 | |
|     l = 0;
 | |
| 
 | |
|   s[l] = '\0';
 | |
|   return l;
 | |
| }
 | |
| 
 | |
| /* Convert a 32-bit unsigned int (unicode) to a UTF-16 string.  Rarely used,
 | |
|    only if sizeof(wchar_t) == 2. */
 | |
| int
 | |
| u32toutf16 (c, s)
 | |
|      u_bits32_t c;
 | |
|      unsigned short *s;
 | |
| {
 | |
|   int l;
 | |
| 
 | |
|   l = 0;
 | |
|   if (c < 0x0d800)
 | |
|     {
 | |
|       s[0] = (unsigned short) (c & 0xFFFF);
 | |
|       l = 1;
 | |
|     }
 | |
|   else if (c >= 0x0e000 && c <= 0x010ffff)
 | |
|     {
 | |
|       c -= 0x010000;
 | |
|       s[0] = (unsigned short)((c >> 10) + 0xd800);
 | |
|       s[1] = (unsigned short)((c & 0x3ff) + 0xdc00);
 | |
|       l = 2;
 | |
|     }
 | |
|   s[l] = 0;
 | |
|   return l;
 | |
| }
 | |
| 
 | |
| /* convert a single unicode-32 character into a multibyte string and put the
 | |
|    result in S, which must be large enough (at least MB_LEN_MAX bytes) */
 | |
| int
 | |
| u32cconv (c, s)
 | |
|      unsigned long c;
 | |
|      char *s;
 | |
| {
 | |
|   wchar_t wc;
 | |
|   wchar_t ws[3];
 | |
|   int n;
 | |
| #if HAVE_ICONV
 | |
|   const char *charset;
 | |
|   char obuf[25], *optr;
 | |
|   size_t obytesleft;
 | |
|   const char *iptr;
 | |
|   size_t sn;
 | |
| #endif
 | |
| 
 | |
| #if __STDC_ISO_10646__
 | |
|   wc = c;
 | |
|   if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
 | |
|     n = wctomb (s, wc);
 | |
|   else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, ws))
 | |
|     n = wcstombs (s, ws, MB_LEN_MAX);
 | |
|   else
 | |
|     n = -1;
 | |
|   if (n != -1)
 | |
|     return n;
 | |
| #endif
 | |
| 
 | |
| #if HAVE_NL_LANGINFO
 | |
|   codeset = nl_langinfo (CODESET);
 | |
|   if (STREQ (codeset, "UTF-8"))
 | |
|     {
 | |
|       n = u32toutf8 (c, s);
 | |
|       return n;
 | |
|     }
 | |
| #endif
 | |
| 
 | |
| #if HAVE_ICONV
 | |
|   /* this is mostly from coreutils-8.5/lib/unicodeio.c */
 | |
|   if (u32init == 0)
 | |
|     {
 | |
| #  if HAVE_LOCALE_CHARSET
 | |
|       charset = locale_charset ();	/* XXX - fix later */
 | |
| #  else
 | |
|       charset = stub_charset ();
 | |
| #  endif
 | |
|       if (STREQ (charset, "UTF-8"))
 | |
| 	utf8locale = 1;
 | |
|       else
 | |
| 	{
 | |
| 	  localconv = iconv_open (charset, "UTF-8");
 | |
| 	  if (localconv == (iconv_t)-1)
 | |
| 	    /* We assume ASCII when presented with an unknown encoding. */
 | |
| 	    localconv = iconv_open ("ASCII", "UTF-8");
 | |
| 	}
 | |
|       u32init = 1;
 | |
|     }
 | |
| 
 | |
|   /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
 | |
|   n = u32toutf8 (c, s);
 | |
|   if (utf8locale)
 | |
|     return n;
 | |
| 
 | |
|   /* If the conversion is not supported, even the ASCII requested above, we
 | |
|      bail now.  Currently we return the UTF-8 conversion.  We could return
 | |
|      u32tocesc(). */
 | |
|   if (localconv == (iconv_t)-1)
 | |
|     return n;
 | |
|     
 | |
|   optr = obuf;
 | |
|   obytesleft = sizeof (obuf);
 | |
|   iptr = s;
 | |
|   sn = n;
 | |
| 
 | |
|   iconv (localconv, NULL, NULL, NULL, NULL);
 | |
| 
 | |
|   if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
 | |
|     {
 | |
| #if 1
 | |
|       /* You get ISO C99 escape sequences if iconv fails */      
 | |
|       n = u32tocesc (c, s);
 | |
| #else
 | |
|       /* You get UTF-8 if iconv fails */
 | |
| #endif
 | |
|       return n;
 | |
|     }
 | |
| 
 | |
|   *optr = '\0';
 | |
| 
 | |
|   /* number of chars to be copied is optr - obuf if we want to do bounds
 | |
|      checking */
 | |
|   strcpy (s, obuf);
 | |
|   return (optr - obuf);
 | |
| #endif	/* HAVE_ICONV */
 | |
| 
 | |
|   n = u32tocesc (c, s);	/* fallback is ISO C99 escape sequences */
 | |
|   return n;
 | |
| }
 | |
| #else
 | |
/* Without multibyte support there is no conversion state to discard, so
   this version does nothing. */
void
u32reset ()
{
  return;
}
 | |
| #endif /* HANDLE_MULTIBYTE */
 | 
