refactor port encoding modes: utf-8 and iconv
* libguile/ports.h (struct scm_t_port): Add a flag for the port encoding mode: UTF8 or iconv. The iconv descriptors are now in a separate structure so that we can avoid attaching finalizers to the ports themselves, in some cases. * libguile/ports.c (scm_c_make_port_with_encoding): Init the encoding mode. (scm_i_remove_port): Adapt to call close_iconv_descriptors. (finalize_iconv_descriptors, open_iconv_descriptors): (close_iconv_descriptors): New infrastructure to manage iconv descriptors. (scm_i_port_iconv_descriptors): New internal helper. (scm_i_set_port_encoding_x): Use open_iconv_descriptors, if needed. (get_iconv_codepoint): Use pt->iconv_descriptors. (get_codepoint): Check the port encoding mode flags. * libguile/print.c (display_string_using_iconv): Use scm_i_port_iconv_descriptors. (display_string): Use pt->encoding_mode flag.
This commit is contained in:
parent
ca2ec018f2
commit
6c98257f2e
3 changed files with 159 additions and 95 deletions
210
libguile/ports.c
210
libguile/ports.c
|
|
@ -563,20 +563,12 @@ finalize_port (GC_PTR ptr, GC_PTR data)
|
|||
else
|
||||
{
|
||||
scm_t_ptob_descriptor *ptob = SCM_PORT_DESCRIPTOR (port);
|
||||
scm_t_port *entry;
|
||||
|
||||
if (ptob->free)
|
||||
/* Yes, I really do mean `free' rather than `close'. `close'
|
||||
is for explicit `close-port' by user. */
|
||||
ptob->free (port);
|
||||
|
||||
entry = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (entry->input_cd != (iconv_t) -1)
|
||||
iconv_close (entry->input_cd);
|
||||
if (entry->output_cd != (iconv_t) -1)
|
||||
iconv_close (entry->output_cd);
|
||||
|
||||
SCM_SETSTREAM (port, 0);
|
||||
SCM_CLR_PORT_OPEN_FLAG (port);
|
||||
|
||||
|
|
@ -613,10 +605,12 @@ scm_c_make_port_with_encoding (scm_t_bits tag, unsigned long mode_bits,
|
|||
entry->port = ret;
|
||||
entry->stream = stream;
|
||||
entry->encoding = encoding ? scm_gc_strdup (encoding, "port") : NULL;
|
||||
/* The conversion descriptors will be opened lazily. */
|
||||
entry->input_cd = (iconv_t) -1;
|
||||
entry->output_cd = (iconv_t) -1;
|
||||
if (encoding && strcmp (encoding, "UTF-8") == 0)
|
||||
entry->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
|
||||
else
|
||||
entry->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
|
||||
entry->ilseq_handler = handler;
|
||||
entry->iconv_descriptors = NULL;
|
||||
|
||||
scm_weak_set_add_x (scm_i_port_weak_set, ret);
|
||||
|
||||
|
|
@ -644,6 +638,8 @@ scm_new_port_table_entry (scm_t_bits tag)
|
|||
|
||||
/* Remove a port from the table and destroy it. */
|
||||
|
||||
static void close_iconv_descriptors (scm_t_iconv_descriptors *id);
|
||||
|
||||
static void
|
||||
scm_i_remove_port (SCM port)
|
||||
#define FUNC_NAME "scm_remove_port"
|
||||
|
|
@ -658,16 +654,10 @@ scm_i_remove_port (SCM port)
|
|||
p->putback_buf = NULL;
|
||||
p->putback_buf_size = 0;
|
||||
|
||||
if (p->input_cd != (iconv_t) -1)
|
||||
if (p->iconv_descriptors)
|
||||
{
|
||||
iconv_close (p->input_cd);
|
||||
p->input_cd = (iconv_t) -1;
|
||||
}
|
||||
|
||||
if (p->output_cd != (iconv_t) -1)
|
||||
{
|
||||
iconv_close (p->output_cd);
|
||||
p->output_cd = (iconv_t) -1;
|
||||
close_iconv_descriptors (p->iconv_descriptors);
|
||||
p->iconv_descriptors = NULL;
|
||||
}
|
||||
}
|
||||
#undef FUNC_NAME
|
||||
|
|
@ -852,73 +842,145 @@ scm_i_default_port_encoding (void)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
scm_i_set_port_encoding_x (SCM port, const char *encoding)
|
||||
static void
|
||||
finalize_iconv_descriptors (GC_PTR ptr, GC_PTR data)
|
||||
{
|
||||
scm_t_port *pt;
|
||||
iconv_t new_input_cd, new_output_cd;
|
||||
close_iconv_descriptors (ptr);
|
||||
}
|
||||
|
||||
new_input_cd = (iconv_t) -1;
|
||||
new_output_cd = (iconv_t) -1;
|
||||
static scm_t_iconv_descriptors *
|
||||
open_iconv_descriptors (const char *encoding, int reading, int writing)
|
||||
{
|
||||
scm_t_iconv_descriptors *id;
|
||||
iconv_t input_cd, output_cd;
|
||||
|
||||
/* Set the character encoding for this port. */
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
input_cd = (iconv_t) -1;
|
||||
output_cd = (iconv_t) -1;
|
||||
|
||||
if (encoding == NULL)
|
||||
encoding = "ISO-8859-1";
|
||||
|
||||
if (pt->encoding != encoding)
|
||||
pt->encoding = scm_gc_strdup (encoding, "port");
|
||||
|
||||
/* If ENCODING is UTF-8, then no conversion descriptor is opened
|
||||
because we do I/O ourselves. This saves 100+ KiB for each
|
||||
descriptor. */
|
||||
if (strcmp (encoding, "UTF-8"))
|
||||
if (reading)
|
||||
{
|
||||
if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
|
||||
{
|
||||
/* Open an input iconv conversion descriptor, from ENCODING
|
||||
to UTF-8. We choose UTF-8, not UTF-32, because iconv
|
||||
implementations can typically convert from anything to
|
||||
UTF-8, but not to UTF-32 (see
|
||||
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
|
||||
new_input_cd = iconv_open ("UTF-8", encoding);
|
||||
if (new_input_cd == (iconv_t) -1)
|
||||
goto invalid_encoding;
|
||||
}
|
||||
/* Open an input iconv conversion descriptor, from ENCODING
|
||||
to UTF-8. We choose UTF-8, not UTF-32, because iconv
|
||||
implementations can typically convert from anything to
|
||||
UTF-8, but not to UTF-32 (see
|
||||
<http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
|
||||
|
||||
if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
|
||||
{
|
||||
new_output_cd = iconv_open (encoding, "UTF-8");
|
||||
if (new_output_cd == (iconv_t) -1)
|
||||
{
|
||||
if (new_input_cd != (iconv_t) -1)
|
||||
iconv_close (new_input_cd);
|
||||
goto invalid_encoding;
|
||||
}
|
||||
}
|
||||
/* Assume opening an iconv descriptor causes about 16 KB of
|
||||
allocation. */
|
||||
scm_gc_register_allocation (16 * 1024);
|
||||
|
||||
input_cd = iconv_open ("UTF-8", encoding);
|
||||
if (input_cd == (iconv_t) -1)
|
||||
goto invalid_encoding;
|
||||
}
|
||||
|
||||
if (pt->input_cd != (iconv_t) -1)
|
||||
iconv_close (pt->input_cd);
|
||||
if (pt->output_cd != (iconv_t) -1)
|
||||
iconv_close (pt->output_cd);
|
||||
if (writing)
|
||||
{
|
||||
/* Assume opening an iconv descriptor causes about 16 KB of
|
||||
allocation. */
|
||||
scm_gc_register_allocation (16 * 1024);
|
||||
|
||||
pt->input_cd = new_input_cd;
|
||||
pt->output_cd = new_output_cd;
|
||||
output_cd = iconv_open (encoding, "UTF-8");
|
||||
if (output_cd == (iconv_t) -1)
|
||||
{
|
||||
if (input_cd != (iconv_t) -1)
|
||||
iconv_close (input_cd);
|
||||
goto invalid_encoding;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors");
|
||||
id->input_cd = input_cd;
|
||||
id->output_cd = output_cd;
|
||||
|
||||
{
|
||||
GC_finalization_proc prev_finalizer;
|
||||
GC_PTR prev_finalization_data;
|
||||
|
||||
/* Register a finalizer to close the descriptors. */
|
||||
GC_REGISTER_FINALIZER_NO_ORDER (id, finalize_iconv_descriptors, 0,
|
||||
&prev_finalizer, &prev_finalization_data);
|
||||
}
|
||||
|
||||
return id;
|
||||
|
||||
invalid_encoding:
|
||||
{
|
||||
SCM err;
|
||||
err = scm_from_locale_string (encoding);
|
||||
scm_misc_error ("scm_i_set_port_encoding_x",
|
||||
scm_misc_error ("open_iconv_descriptors",
|
||||
"invalid or unknown character encoding ~s",
|
||||
scm_list_1 (err));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
close_iconv_descriptors (scm_t_iconv_descriptors *id)
|
||||
{
|
||||
if (id->input_cd != (iconv_t) -1)
|
||||
iconv_close (id->input_cd);
|
||||
if (id->output_cd != (iconv_t) -1)
|
||||
iconv_close (id->output_cd);
|
||||
id->input_cd = (void *) -1;
|
||||
id->output_cd = (void *) -1;
|
||||
}
|
||||
|
||||
scm_t_iconv_descriptors *
|
||||
scm_i_port_iconv_descriptors (SCM port)
|
||||
{
|
||||
scm_t_port *pt;
|
||||
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
assert (pt->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV);
|
||||
|
||||
if (!pt->iconv_descriptors)
|
||||
{
|
||||
if (!pt->encoding)
|
||||
pt->encoding = "ISO-8859-1";
|
||||
pt->iconv_descriptors =
|
||||
open_iconv_descriptors (pt->encoding,
|
||||
SCM_INPUT_PORT_P (port),
|
||||
SCM_OUTPUT_PORT_P (port));
|
||||
}
|
||||
|
||||
return pt->iconv_descriptors;
|
||||
}
|
||||
|
||||
void
|
||||
scm_i_set_port_encoding_x (SCM port, const char *encoding)
|
||||
{
|
||||
scm_t_port *pt;
|
||||
scm_t_iconv_descriptors *prev;
|
||||
|
||||
/* Set the character encoding for this port. */
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
prev = pt->iconv_descriptors;
|
||||
|
||||
if (encoding == NULL)
|
||||
encoding = "ISO-8859-1";
|
||||
|
||||
if (strcmp (encoding, "UTF-8") == 0)
|
||||
{
|
||||
pt->encoding = "UTF-8";
|
||||
pt->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
|
||||
pt->iconv_descriptors = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Open descriptors before mutating the port. */
|
||||
pt->iconv_descriptors =
|
||||
open_iconv_descriptors (encoding,
|
||||
SCM_INPUT_PORT_P (port),
|
||||
SCM_OUTPUT_PORT_P (port));
|
||||
pt->encoding = scm_gc_strdup (encoding, "port");
|
||||
pt->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
|
||||
}
|
||||
|
||||
if (prev)
|
||||
close_iconv_descriptors (prev);
|
||||
}
|
||||
|
||||
SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
|
||||
(SCM port),
|
||||
"Returns, as a string, the character encoding that @var{port}\n"
|
||||
|
|
@ -1616,13 +1678,13 @@ static int
|
|||
get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
|
||||
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
|
||||
{
|
||||
scm_t_port *pt;
|
||||
scm_t_iconv_descriptors *id;
|
||||
int err, byte_read;
|
||||
size_t bytes_consumed, output_size;
|
||||
char *output;
|
||||
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
|
||||
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
id = scm_i_port_iconv_descriptors (port);
|
||||
|
||||
for (output_size = 0, output = (char *) utf8_buf,
|
||||
bytes_consumed = 0, err = 0;
|
||||
|
|
@ -1652,8 +1714,7 @@ get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
|
|||
input_left = bytes_consumed + 1;
|
||||
output_left = sizeof (utf8_buf);
|
||||
|
||||
done = iconv (pt->input_cd, &input, &input_left,
|
||||
&output, &output_left);
|
||||
done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
|
||||
if (done == (size_t) -1)
|
||||
{
|
||||
err = errno;
|
||||
|
|
@ -1689,12 +1750,7 @@ get_codepoint (SCM port, scm_t_wchar *codepoint,
|
|||
int err;
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (pt->input_cd == (iconv_t) -1)
|
||||
/* Initialize the conversion descriptors, if needed. */
|
||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||
|
||||
/* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */
|
||||
if (pt->input_cd == (iconv_t) -1)
|
||||
if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
|
||||
else
|
||||
err = get_iconv_codepoint (port, codepoint, buf, len);
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
#define SCM_PORTS_H
|
||||
|
||||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
||||
* 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
|
||||
* 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
|
|
@ -48,6 +48,20 @@ typedef enum scm_t_port_rw_active {
|
|||
SCM_PORT_WRITE = 2
|
||||
} scm_t_port_rw_active;
|
||||
|
||||
typedef enum scm_t_port_encoding_mode {
|
||||
SCM_PORT_ENCODING_MODE_UTF8,
|
||||
SCM_PORT_ENCODING_MODE_ICONV
|
||||
} scm_t_port_encoding_mode;
|
||||
|
||||
/* This is a separate object so that only those ports that use iconv
|
||||
cause finalizers to be registered. */
|
||||
typedef struct scm_t_iconv_descriptors
|
||||
{
|
||||
/* input/output iconv conversion descriptors */
|
||||
void *input_cd;
|
||||
void *output_cd;
|
||||
} scm_t_iconv_descriptors;
|
||||
|
||||
/* C representation of a Scheme port. */
|
||||
|
||||
typedef struct
|
||||
|
|
@ -65,10 +79,6 @@ typedef struct
|
|||
long line_number; /* debugging support. */
|
||||
int column_number; /* debugging support. */
|
||||
|
||||
/* Character encoding support */
|
||||
char *encoding;
|
||||
scm_t_string_failed_conversion_handler ilseq_handler;
|
||||
|
||||
/* port buffers. the buffer(s) are set up for all ports.
|
||||
in the case of string ports, the buffer is the string itself.
|
||||
in the case of unbuffered file ports, the buffer is a
|
||||
|
|
@ -119,9 +129,11 @@ typedef struct
|
|||
unsigned char *putback_buf;
|
||||
size_t putback_buf_size; /* allocated size of putback_buf. */
|
||||
|
||||
/* input/output iconv conversion descriptors */
|
||||
void *input_cd;
|
||||
void *output_cd;
|
||||
/* Character encoding support */
|
||||
char *encoding;
|
||||
scm_t_port_encoding_mode encoding_mode;
|
||||
scm_t_string_failed_conversion_handler ilseq_handler;
|
||||
scm_t_iconv_descriptors *iconv_descriptors;
|
||||
} scm_t_port;
|
||||
|
||||
|
||||
|
|
@ -284,6 +296,7 @@ SCM_API SCM scm_close_output_port (SCM port);
|
|||
characters. */
|
||||
SCM_INTERNAL const char *scm_i_default_port_encoding (void);
|
||||
SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
|
||||
SCM_INTERNAL scm_t_iconv_descriptors *scm_i_port_iconv_descriptors (SCM port);
|
||||
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
|
||||
SCM_API SCM scm_port_encoding (SCM port);
|
||||
SCM_API SCM scm_set_port_encoding_x (SCM port, SCM encoding);
|
||||
|
|
|
|||
|
|
@ -861,9 +861,9 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
|
|||
scm_t_string_failed_conversion_handler strategy)
|
||||
{
|
||||
size_t printed;
|
||||
scm_t_port *pt;
|
||||
scm_t_iconv_descriptors *id;
|
||||
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
id = scm_i_port_iconv_descriptors (port);
|
||||
|
||||
printed = 0;
|
||||
|
||||
|
|
@ -892,7 +892,7 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
|
|||
output = encoded_output;
|
||||
output_left = sizeof (encoded_output);
|
||||
|
||||
done = iconv (pt->output_cd, &input, &input_left,
|
||||
done = iconv (id->output_cd, &input, &input_left,
|
||||
&output, &output_left);
|
||||
|
||||
output_len = sizeof (encoded_output) - output_left;
|
||||
|
|
@ -902,7 +902,7 @@ display_string_using_iconv (const void *str, int narrow_p, size_t len,
|
|||
int errno_save = errno;
|
||||
|
||||
/* Reset the `iconv' state. */
|
||||
iconv (pt->output_cd, NULL, NULL, NULL, NULL);
|
||||
iconv (id->output_cd, NULL, NULL, NULL, NULL);
|
||||
|
||||
/* Print the OUTPUT_LEN bytes successfully converted. */
|
||||
scm_lfwrite_unlocked (encoded_output, output_len, port);
|
||||
|
|
@ -966,12 +966,7 @@ display_string (const void *str, int narrow_p,
|
|||
|
||||
pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (pt->output_cd == (iconv_t) -1)
|
||||
/* Initialize the conversion descriptors, if needed. */
|
||||
scm_i_set_port_encoding_x (port, pt->encoding);
|
||||
|
||||
/* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */
|
||||
if (pt->output_cd == (iconv_t) -1)
|
||||
if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
|
||||
return display_string_as_utf8 (str, narrow_p, len, port);
|
||||
else
|
||||
return display_string_using_iconv (str, narrow_p, len,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue