fix scm_to_utf8_stringn for wide strings to NUL-append the strings

* libguile/strings.c (u32_u8_strlen, scm_to_utf8_stringn): Sigh, fix the
  u32-to-u8 optimization to ensure that the utf8 string is
  NUL-terminated.
This commit is contained in:
Andy Wingo 2012-04-01 11:29:41 -07:00
commit 2c1b79513b

View file

@ -1874,6 +1874,39 @@ latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
return u8_result;
}
/* From RFC 3629:
Char. number range | UTF-8 octet sequence
(hexadecimal) | (binary)
--------------------+---------------------------------------------
0000 0000-0000 007F | 0xxxxxxx
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
static size_t
u32_u8_strlen (const scm_t_uint32 *str, size_t len)
{
size_t ret, i;
for (i = 0, ret = 0; i < len; i++)
{
scm_t_uint32 c = str[i];
if (c <= 0x7f)
ret += 1;
else if (c <= 0x7ff)
ret += 2;
else if (c <= 0xffff)
ret += 3;
else
ret += 4;
}
return ret;
}
char *
scm_to_utf8_stringn (SCM str, size_t *lenp)
{
@ -1882,9 +1915,39 @@ scm_to_utf8_stringn (SCM str, size_t *lenp)
scm_i_string_length (str),
NULL, lenp);
else
return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
scm_i_string_length (str),
NULL, lenp);
{
scm_t_uint8 *buf, *ret;
size_t len, allocated;
len = u32_u8_strlen ((scm_t_uint32*)scm_i_string_wide_chars (str),
scm_i_string_length (str));
allocated = len + 1;
buf = scm_malloc (allocated);
ret = u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
scm_i_string_length (str), buf, &len);
if (ret == buf && len + 1 == allocated)
{
ret[len] = 0;
return (char *) ret;
}
/* An error: a bad codepoint. */
{
int saved_errno = errno;
free (buf);
if (!saved_errno)
abort ();
scm_decoding_error ("scm_to_utf8_stringn", errno,
"invalid codepoint in string", str);
/* Not reached. */
return NULL;
}
}
}
scm_t_wchar *