236 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			236 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| 
								 | 
							
								/* unicode.c - functions to convert unicode characters */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Copyright (C) 2010 Free Software Foundation, Inc.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   This file is part of GNU Bash, the Bourne Again SHell.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   Bash is free software: you can redistribute it and/or modify
							 | 
						||
| 
								 | 
							
								   it under the terms of the GNU General Public License as published by
							 | 
						||
| 
								 | 
							
								   the Free Software Foundation, either version 3 of the License, or
							 | 
						||
| 
								 | 
							
								   (at your option) any later version.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   Bash is distributed in the hope that it will be useful,
							 | 
						||
| 
								 | 
							
								   but WITHOUT ANY WARRANTY; without even the implied warranty of
							 | 
						||
| 
								 | 
							
								   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
							 | 
						||
| 
								 | 
							
								   GNU General Public License for more details.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   You should have received a copy of the GNU General Public License
							 | 
						||
| 
								 | 
							
								   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
							 | 
						||
| 
								 | 
							
								*/
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <config.h>

								#if defined (HANDLE_MULTIBYTE)

								#include <stdc.h>
								#include <wchar.h>
								#include <bashansi.h>
								#ifdef HAVE_UNISTD_H
								#include <unistd.h>
								#endif
								#include <limits.h>

								#if HAVE_ICONV
								#  include <iconv.h>
								#endif

								#if HAVE_NL_LANGINFO
								#  include <langinfo.h>
								#endif

								#include <xmalloc.h>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Largest value an unsigned short can hold.  Use the <limits.h> spelling
								   when it is available, otherwise compute it. */
								#if !defined (USHORT_MAX)
								#  if defined (USHRT_MAX)
								#    define USHORT_MAX USHRT_MAX
								#  else
								#    define USHORT_MAX ((unsigned short) ~(unsigned short)0)
								#  endif
								#endif /* !USHORT_MAX */

								/* True if the strings A and B are equal.  The first characters are compared
								   before strcmp is called; both arguments are evaluated more than once. */
								#ifndef STREQ
								#  define STREQ(a, b) (*(a) == *(b) && strcmp ((a), (b)) == 0)
								#endif /* !STREQ */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#if defined (HAVE_LOCALE_CHARSET)
							 | 
						||
| 
								 | 
							
								extern const char *locale_charset __P((void));
							 | 
						||
| 
								 | 
							
								#else
							 | 
						||
| 
								 | 
							
								extern char *get_locale_var __P((char *));
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int u32init = 0;
							 | 
						||
| 
								 | 
							
								static int utf8locale = 0;
							 | 
						||
| 
								 | 
							
								#if defined (HAVE_ICONV)
							 | 
						||
| 
								 | 
							
								static iconv_t localconv;
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifndef HAVE_LOCALE_CHARSET
							 | 
						||
| 
								 | 
							
								/* Return the name of the character set selected by the LC_CTYPE locale
								   setting, for use when locale_charset() is not available.

								   The charset is the text following the last `.' in the locale name, with
								   any `@modifier' suffix removed.  A locale name without a `.' yields
								   "UTF-8" if it is exactly "UTF-8" and "ASCII" otherwise; an unset or empty
								   locale yields "ASCII".

								   The charset name is copied into a static buffer so that the string
								   returned by get_locale_var is never modified.  The result is therefore
								   only valid until the next call, and names longer than the buffer are
								   truncated. */
								static char *
								stub_charset ()
								{
								  static char charsetbuf[40];
								  char *locale, *s;
								  int i;

								  locale = get_locale_var ("LC_CTYPE");
								  if (locale == 0 || *locale == 0)
								    return "ASCII";

								  s = strrchr (locale, '.');
								  if (s)
								    {
								      s++;	/* skip the `.' */
								      /* Copy up to the modifier separator or the end of the name. */
								      for (i = 0; i < (int) sizeof (charsetbuf) - 1 && s[i] && s[i] != '@'; i++)
								        charsetbuf[i] = s[i];
								      charsetbuf[i] = '\0';
								      return charsetbuf;
								    }
								  else if (STREQ (locale, "UTF-8"))
								    return "UTF-8";
								  else
								    return "ASCII";
								}
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* u32toascii ? */
							 | 
						||
| 
								 | 
							
								int
							 | 
						||
| 
								 | 
							
								u32tochar (wc, s)
							 | 
						||
| 
								 | 
							
								     wchar_t wc;
							 | 
						||
| 
								 | 
							
								     char *s;
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								  unsigned long x;
							 | 
						||
| 
								 | 
							
								  int l;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  x = wc;
							 | 
						||
| 
								 | 
							
								  l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								  if (x <= UCHAR_MAX)
							 | 
						||
| 
								 | 
							
								    s[0] = x & 0xFF;
							 | 
						||
| 
								 | 
							
								  else if (x <= USHORT_MAX)	/* assume unsigned short = 16 bits */
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								      s[0] = (x >> 8) & 0xFF;
							 | 
						||
| 
								 | 
							
								      s[1] = x & 0xFF;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								  else
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								      s[0] = (x >> 24) & 0xFF;
							 | 
						||
| 
								 | 
							
								      s[1] = (x >> 16) & 0xFF;
							 | 
						||
| 
								 | 
							
								      s[2] = (x >> 8) & 0xFF;
							 | 
						||
| 
								 | 
							
								      s[3] = x & 0xFF;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								  s[l] = '\0';
							 | 
						||
| 
								 | 
							
								  return l;  
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Encode the code point WC as UTF-8 in S and NUL-terminate it.  S must have
								   room for at least five bytes.  Returns the number of bytes stored, not
								   counting the terminating NUL:

								     below 0x80			1 byte
								     0x80 - 0x7FF		2 bytes
								     0x800 - 0xFFFF		3 bytes
								     0x10000 - 0x10FFFF		4 bytes

								   Values above 0x10FFFF cannot be represented in UTF-8; for those S is set
								   to the empty string and 0 is returned.  Surrogate values (0xD800 - 0xDFFF)
								   are not rejected. */
								int
								u32toutf8 (wc, s)
								     wchar_t wc;
								     char *s;
								{
								  int l;

								  if (wc < 0x0080)
								    {
								      s[0] = (unsigned char)wc;
								      l = 1;
								    }
								  else if (wc < 0x0800)
								    {
								      s[0] = (wc >> 6) | 0xc0;
								      s[1] = (wc & 0x3f) | 0x80;
								      l = 2;
								    }
								  else if (wc < 0x10000)
								    {
								      s[0] = (wc >> 12) | 0xe0;
								      s[1] = ((wc >> 6) & 0x3f) | 0x80;
								      s[2] = (wc & 0x3f) | 0x80;
								      l = 3;
								    }
								  else if (wc <= 0x10FFFF)
								    {
								      /* Supplementary planes need the four-byte form; the three-byte
								         form would silently drop the high-order bits. */
								      s[0] = (wc >> 18) | 0xf0;
								      s[1] = ((wc >> 12) & 0x3f) | 0x80;
								      s[2] = ((wc >> 6) & 0x3f) | 0x80;
								      s[3] = (wc & 0x3f) | 0x80;
								      l = 4;
								    }
								  else
								    l = 0;	/* not a Unicode code point */

								  s[l] = '\0';
								  return l;
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Convert the single 32-bit Unicode character C into a multibyte string in
								   the current locale's encoding and put the result in S, which must be
								   large enough (at least MB_LEN_MAX bytes).  Returns the number of bytes
								   stored in S.

								   The conversion strategies are tried in this order, depending on what the
								   build provides:
								     1. wctomb, when wchar_t holds ISO 10646 values and is four bytes wide;
								     2. direct UTF-8 encoding, when nl_langinfo reports a "UTF-8" codeset;
								     3. iconv from UTF-8 to the locale's charset; if the conversion fails
								        the UTF-8 encoding is left in S;
								     4. u32tochar, when iconv is not available or no conversion descriptor
								        could be opened.

								   The locale's charset is looked up only on the first call that reaches
								   the iconv code and is remembered in file-scope state after that. */
								int
								u32cconv (c, s)
								     unsigned long c;
								     char *s;
								{
								  wchar_t wc;
								  int n;
								#if HAVE_NL_LANGINFO
								  const char *codeset;
								#endif
								#if HAVE_ICONV
								  const char *charset;
								  char obuf[25], *optr;
								  size_t obytesleft;
								  const char *iptr;
								  size_t sn;
								#endif

								  wc = c;

								#if __STDC_ISO_10646__
								  /* wchar_t values are ISO 10646 code points, so wctomb can do the
								     conversion.  If it cannot represent the character, fall through to the
								     other strategies rather than returning -1 as a length. */
								  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
								    {
								      n = wctomb (s, wc);
								      if (n != -1)
								        return n;
								    }
								#endif

								#if HAVE_NL_LANGINFO
								  codeset = nl_langinfo (CODESET);
								  if (STREQ (codeset, "UTF-8"))
								    return u32toutf8 (wc, s);
								#endif

								#if HAVE_ICONV
								  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
								  if (u32init == 0)
								    {
								#  if HAVE_LOCALE_CHARSET
								      charset = locale_charset ();	/* XXX - fix later */
								#  else
								      charset = stub_charset ();
								#  endif
								      if (STREQ (charset, "UTF-8"))
								        utf8locale = 1;
								      else
								        {
								          localconv = iconv_open (charset, "UTF-8");
								          if (localconv == (iconv_t)-1)
								            localconv = iconv_open (charset, "ASCII");
								        }
								      u32init = 1;
								    }

								  if (utf8locale)
								    return u32toutf8 (wc, s);

								  /* No usable conversion descriptor: store the raw bytes. */
								  if (localconv == (iconv_t)-1)
								    return u32tochar (wc, s);

								  /* Encode as UTF-8 first, then convert that to the locale's charset. */
								  n = u32toutf8 (wc, s);

								  optr = obuf;
								  obytesleft = sizeof (obuf) - 1;	/* leave room for the terminating NUL */
								  iptr = s;
								  sn = n;

								  /* Reset the descriptor to its initial conversion state. */
								  iconv (localconv, NULL, NULL, NULL, NULL);

								  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
								    return n;	/* You get utf-8 if iconv fails */

								  *optr = '\0';

								  /* number of chars to be copied is optr - obuf if we want to do bounds
								     checking */
								  strcpy (s, obuf);
								  return (int) (optr - obuf);
								#else
								  n = u32tochar (wc, s);	/* fallback */
								  return n;
								#endif
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#endif /* HANDLE_MULTIBYTE */
							 |