236 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			236 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|   | /* unicode.c - functions to convert unicode characters */ | ||
|  | 
 | ||
|  | /* Copyright (C) 2010 Free Software Foundation, Inc.
 | ||
|  | 
 | ||
|  |    This file is part of GNU Bash, the Bourne Again SHell. | ||
|  | 
 | ||
|  |    Bash is free software: you can redistribute it and/or modify | ||
|  |    it under the terms of the GNU General Public License as published by | ||
|  |    the Free Software Foundation, either version 3 of the License, or | ||
|  |    (at your option) any later version. | ||
|  | 
 | ||
|  |    Bash is distributed in the hope that it will be useful, | ||
|  |    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |    GNU General Public License for more details. | ||
|  | 
 | ||
|  |    You should have received a copy of the GNU General Public License | ||
|  |    along with Bash.  If not, see <http://www.gnu.org/licenses/>.
 | ||
|  | */ | ||
|  | 
 | ||
|  | #include <config.h>
 | ||
|  | 
 | ||
|  | #if defined (HANDLE_MULTIBYTE)
 | ||
|  | 
 | ||
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <limits.h>

#if HAVE_NL_LANGINFO
#  include <langinfo.h>
#endif

#if HAVE_ICONV
#  include <iconv.h>
#endif

#include <xmalloc.h>
|  | 
 | ||
|  | #ifndef USHORT_MAX
 | ||
|  | #  ifdef USHRT_MAX
 | ||
|  | #    define USHORT_MAX USHRT_MAX
 | ||
|  | #  else
 | ||
|  | #    define USHORT_MAX ((unsigned short) ~(unsigned short)0)
 | ||
|  | #  endif
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | #if !defined (STREQ)
 | ||
|  | #  define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
 | ||
|  | #endif /* !STREQ */
 | ||
|  | 
 | ||
|  | #if defined (HAVE_LOCALE_CHARSET)
 | ||
|  | extern const char *locale_charset __P((void)); | ||
|  | #else
 | ||
|  | extern char *get_locale_var __P((char *)); | ||
|  | #endif
 | ||
|  | 
 | ||
/* Nonzero once u32cconv has looked up the locale's character set and, if
   needed, opened the iconv descriptor below. */
static int u32init = 0;
/* Set to 1 by u32cconv when the locale's character set is "UTF-8". */
static int utf8locale = 0;
/* Guarded with the same `#if HAVE_ICONV' test that controls the inclusion of
   <iconv.h>, so iconv_t is never referenced without its declaration. */
#if HAVE_ICONV
/* Descriptor used by u32cconv to convert from UTF-8 to the locale's
   character set; (iconv_t)-1 if iconv_open failed. */
static iconv_t localconv;
#endif
|  | 
 | ||
|  | #ifndef HAVE_LOCALE_CHARSET
 | ||
/* Return the name of the character set named by the LC_CTYPE locale
   variable, for systems without locale_charset().  The character set is the
   part of the locale name after the last `.', with any `@modifier' suffix
   removed.  Returns "ASCII" if the variable is unset or empty, or if it has
   no `.' and is not exactly "UTF-8".

   The result may point to a static buffer that is overwritten by the next
   call; names longer than the buffer are truncated.  The string returned by
   get_locale_var is never modified. */
static char *
stub_charset ()
{
  static char charsetbuf[40];
  char *locale, *s, *t;
  size_t len;

  locale = get_locale_var ("LC_CTYPE");
  if (locale == 0 || *locale == 0)
    return "ASCII";
  s = strrchr (locale, '.');
  if (s)
    {
      s++;			/* skip the `.' */
      /* Copy the name instead of writing a NUL over the `@' in place:
	 LOCALE belongs to whoever owns the locale variable, and truncating
	 it there would change the variable's value for everyone else. */
      t = strchr (s, '@');
      len = t ? (size_t)(t - s) : strlen (s);
      if (len >= sizeof (charsetbuf))
	len = sizeof (charsetbuf) - 1;
      memcpy (charsetbuf, s, len);
      charsetbuf[len] = '\0';
      return charsetbuf;
    }
  else if (STREQ (locale, "UTF-8"))
    return "UTF-8";
  else
    return "ASCII";
}
|  | #endif
 | ||
|  | 
 | ||
|  | /* u32toascii ? */ | ||
|  | int | ||
|  | u32tochar (wc, s) | ||
|  |      wchar_t wc; | ||
|  |      char *s; | ||
|  | { | ||
|  |   unsigned long x; | ||
|  |   int l; | ||
|  | 
 | ||
|  |   x = wc; | ||
|  |   l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4); | ||
|  | 
 | ||
|  |   if (x <= UCHAR_MAX) | ||
|  |     s[0] = x & 0xFF; | ||
|  |   else if (x <= USHORT_MAX)	/* assume unsigned short = 16 bits */ | ||
|  |     { | ||
|  |       s[0] = (x >> 8) & 0xFF; | ||
|  |       s[1] = x & 0xFF; | ||
|  |     } | ||
|  |   else | ||
|  |     { | ||
|  |       s[0] = (x >> 24) & 0xFF; | ||
|  |       s[1] = (x >> 16) & 0xFF; | ||
|  |       s[2] = (x >> 8) & 0xFF; | ||
|  |       s[3] = x & 0xFF; | ||
|  |     } | ||
|  |   s[l] = '\0'; | ||
|  |   return l;   | ||
|  | } | ||
|  | 
 | ||
/* Encode the Unicode code point WC as UTF-8 in S, store a NUL after the
   last byte, and return the number of bytes in the encoding (not counting
   the NUL).  S must have room for at least five bytes.

   Code points up to U+10FFFF produce one to four bytes.  Values in the
   surrogate range are not rejected; they get the ordinary three-byte form.
   A value above U+10FFFF, or a negative WC where wchar_t is signed, has no
   UTF-8 encoding: S is set to the empty string and 0 is returned. */
int
u32toutf8 (wc, s)
     wchar_t wc;
     char *s;
{
  unsigned long x;
  int l;

  /* Work on an unsigned copy so the shifts are well defined; a negative WC
     becomes a huge value and is rejected by the final branch. */
  x = (unsigned long) wc;

  if (x < 0x0080)
    {
      s[0] = (unsigned char)x;
      l = 1;
    }
  else if (x < 0x0800)
    {
      s[0] = (x >> 6) | 0xc0;
      s[1] = (x & 0x3f) | 0x80;
      l = 2;
    }
  else if (x < 0x10000)
    {
      s[0] = (x >> 12) | 0xe0;
      s[1] = ((x >> 6) & 0x3f) | 0x80;
      s[2] = (x & 0x3f) | 0x80;
      l = 3;
    }
  else if (x <= 0x10FFFF)
    {
      /* Characters outside the Basic Multilingual Plane need four bytes;
	 pushing them through the three-byte form yields an invalid lead
	 byte. */
      s[0] = (x >> 18) | 0xf0;
      s[1] = ((x >> 12) & 0x3f) | 0x80;
      s[2] = ((x >> 6) & 0x3f) | 0x80;
      s[3] = (x & 0x3f) | 0x80;
      l = 4;
    }
  else
    l = 0;

  s[l] = '\0';
  return l;
}
|  | 
 | ||
/* Convert the single Unicode (UTF-32) character C into a multibyte string
   and put the result in S, which must be large enough (at least MB_LEN_MAX
   bytes).  Returns the number of bytes stored in S; the return value is
   never negative.

   The conversions are tried in this order, depending on what the build
   provides:
     1. wctomb, when wchar_t is known to hold ISO 10646 code points;
     2. direct UTF-8 encoding, when nl_langinfo reports a UTF-8 codeset;
     3. iconv from UTF-8 to the locale's character set (the result is UTF-8
	if the locale is UTF-8 or if the conversion fails, and the raw bytes
	from u32tochar if no conversion descriptor could be opened);
     4. the raw bytes from u32tochar, when iconv is not available. */
int
u32cconv (c, s)
     unsigned long c;
     char *s;
{
  wchar_t wc;
  int n;
#if HAVE_NL_LANGINFO
  const char *codeset;
#endif
#if HAVE_ICONV
  const char *charset;
  char obuf[25], *optr;
  size_t obytesleft;
  const char *iptr;
  size_t sn;
#endif

  wc = c;

#if __STDC_ISO_10646__
  /* Only hand WC to wctomb if C survived the conversion to a 32-bit
     wchar_t.  If wctomb cannot represent the character it returns -1;
     fall through to the other conversions in that case rather than
     passing the -1 back as a byte count. */
  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
    {
      n = wctomb (s, wc);
      if (n != -1)
	return n;
    }
#endif

#if HAVE_NL_LANGINFO
  codeset = nl_langinfo (CODESET);
  if (STREQ (codeset, "UTF-8"))
    {
      n = u32toutf8 (wc, s);
      return n;
    }
#endif

#if HAVE_ICONV
  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
  if (u32init == 0)
    {
#  if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
#  else
      charset = stub_charset ();
#  endif
      if (STREQ (charset, "UTF-8"))
	utf8locale = 1;
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv == (iconv_t)-1)
	    localconv = iconv_open (charset, "ASCII");
	}
      u32init = 1;
    }

  if (utf8locale)
    {
      n = u32toutf8 (wc, s);
      return n;
    }

  if (localconv == (iconv_t)-1)
    {
      n = u32tochar (wc, s);
      return n;
    }

  /* Use S to hold the UTF-8 form that is the input to iconv. */
  n = u32toutf8 (wc, s);

  optr = obuf;
  /* Keep one byte of OBUF in reserve for the terminating NUL stored below. */
  obytesleft = sizeof (obuf) - 1;
  iptr = s;
  sn = n;

  /* Put the descriptor back into its initial shift state. */
  iconv (localconv, NULL, NULL, NULL, NULL);

  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
    return n;	/* You get utf-8 if iconv fails */

  *optr = '\0';

  /* number of chars to be copied is optr - obuf if we want to do bounds
     checking */
  strcpy (s, obuf);
  return (int)(optr - obuf);
#else
  n = u32tochar (wc, s);	/* fallback */
  return n;
#endif
}
|  | 
 | ||
|  | #endif /* HANDLE_MULTIBYTE */
 |