Have string ports honor `%default-port-encoding'.
* libguile/strports.c (scm_i_mkstrport): Remove.
(scm_mkstrport): Don't change the port's encoding to UTF-8; convert
STR to the default port encoding.
(scm_strport_to_string): Fix documentation & indentation.
* libguile/strports.h (scm_i_mkstrport): Remove.
* test-suite/lib.scm (exception:encoding-error): New variable.
(format-test-name): Set `%default-port-encoding' to "UTF-8".
* test-suite/tests/ports.test ("string ports")["%default-port-encoding
is honored", "suitable encoding [latin-1]", "suitable encoding
[latin-3]", "wrong encoding"]: New tests.
* test-suite/tests/r6rs-ports.test ("7.2.11 Binary
Output")["put-bytevector with UTF-16 string port", "put-bytevector
with wrong-encoding string port"]: New tests.
* test-suite/tests/reader.test (read-string): Set
`%default-port-encoding' to `#f'.
("reading")["unprintable symbol"]: Use a string that doesn't contain
zeros.
* doc/ref/api-io.texi (String Ports): Document encoding issues with
`call-with-output-string' and `with-output-to-string'.
This commit is contained in:
parent
29bcdbb059
commit
7b0419128b
7 changed files with 137 additions and 81 deletions
|
|
@ -1,6 +1,6 @@
|
|||
@c -*-texinfo-*-
|
||||
@c This is part of the GNU Guile Reference Manual.
|
||||
@c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009
|
||||
@c Copyright (C) 1996, 1997, 2000, 2001, 2002, 2003, 2004, 2007, 2009, 2010
|
||||
@c Free Software Foundation, Inc.
|
||||
@c See the file guile.texi for copying conditions.
|
||||
|
||||
|
|
@ -972,6 +972,28 @@ away from its default.
|
|||
Calls the one-argument procedure @var{proc} with a newly created output
|
||||
port. When the function returns, the string composed of the characters
|
||||
written into the port is returned. @var{proc} should not close the port.
|
||||
|
||||
Note that which characters can be written to a string port depend on the port's
|
||||
encoding. The default encoding of string ports is specified by the
|
||||
@code{%default-port-encoding} fluid (@pxref{Ports,
|
||||
@code{%default-port-encoding}}). For instance, it is an error to write Greek
|
||||
letter alpha to an ISO-8859-1-encoded string port since this character cannot be
|
||||
represented with ISO-8859-1:
|
||||
|
||||
@example
|
||||
(define alpha (integer->char #x03b1)) ; GREEK SMALL LETTER ALPHA
|
||||
|
||||
(with-fluids ((%default-port-encoding "ISO-8859-1"))
|
||||
(call-with-output-string
|
||||
(lambda (p)
|
||||
(display alpha p))))
|
||||
|
||||
@result{}
|
||||
Throw to key `encoding-error'
|
||||
@end example
|
||||
|
||||
Changing the string port's encoding to a Unicode-capable encoding such as UTF-8
|
||||
solves the problem.
|
||||
@end deffn
|
||||
|
||||
@deffn {Scheme Procedure} call-with-input-string string proc
|
||||
|
|
@ -985,6 +1007,8 @@ read. The value yielded by the @var{proc} is returned.
|
|||
Calls the zero-argument procedure @var{thunk} with the current output
|
||||
port set temporarily to a new string port. It returns a string
|
||||
composed of the characters written to the current output.
|
||||
|
||||
See @code{call-with-output-string} above for character encoding considerations.
|
||||
@end deffn
|
||||
|
||||
@deffn {Scheme Procedure} with-input-from-string string thunk
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 1995,1996,1998,1999,2000,2001,2002, 2003, 2005, 2006, 2009 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 1995,1996,1998,1999,2000,2001,2002, 2003, 2005, 2006, 2009, 2010 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
|
|
@ -289,84 +289,60 @@ st_truncate (SCM port, scm_t_off length)
|
|||
pt->write_pos = pt->read_end;
|
||||
}
|
||||
|
||||
SCM
|
||||
scm_i_mkstrport (SCM pos, const char *utf8_str, size_t str_len, long modes, const char *caller)
|
||||
SCM
|
||||
scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
|
||||
{
|
||||
SCM z, str;
|
||||
SCM z;
|
||||
scm_t_port *pt;
|
||||
size_t c_pos;
|
||||
char *buf;
|
||||
size_t str_len, c_pos;
|
||||
char *buf, *c_str;
|
||||
|
||||
/* Because ports are inherently 8-bit, strings need to be converted
|
||||
to a locale representation for storage. But, since string ports
|
||||
rely on string functionality for their memory management, we need
|
||||
to create a new string that has the 8-bit locale representation
|
||||
of the underlying string.
|
||||
|
||||
locale_str is already in the locale of the port. */
|
||||
str = scm_i_make_string (str_len, &buf);
|
||||
memcpy (buf, utf8_str, str_len);
|
||||
|
||||
c_pos = scm_to_unsigned_integer (pos, 0, str_len);
|
||||
SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
|
||||
c_pos = scm_to_unsigned_integer (pos, 0, scm_i_string_length (str));
|
||||
|
||||
if (!((modes & SCM_WRTNG) || (modes & SCM_RDNG)))
|
||||
scm_misc_error ("scm_mkstrport", "port must read or write", SCM_EOL);
|
||||
|
||||
scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
|
||||
scm_dynwind_begin (0);
|
||||
scm_i_dynwind_pthread_mutex_lock (&scm_i_port_table_mutex);
|
||||
|
||||
z = scm_new_port_table_entry (scm_tc16_strport);
|
||||
pt = SCM_PTAB_ENTRY(z);
|
||||
SCM_SETSTREAM (z, SCM_UNPACK (str));
|
||||
SCM_SET_CELL_TYPE(z, scm_tc16_strport|modes);
|
||||
pt->write_buf = pt->read_buf = (unsigned char *) scm_i_string_chars (str);
|
||||
SCM_SET_CELL_TYPE (z, scm_tc16_strport | modes);
|
||||
|
||||
/* Create a copy of STR in the encoding of Z. */
|
||||
buf = scm_to_stringn (str, &str_len, pt->encoding,
|
||||
SCM_FAILED_CONVERSION_ERROR);
|
||||
c_str = scm_gc_malloc (str_len, "strport");
|
||||
memcpy (c_str, buf, str_len);
|
||||
free (buf);
|
||||
|
||||
pt->write_buf = pt->read_buf = (unsigned char *) c_str;
|
||||
pt->read_pos = pt->write_pos = pt->read_buf + c_pos;
|
||||
pt->write_buf_size = pt->read_buf_size = str_len;
|
||||
pt->write_end = pt->read_end = pt->read_buf + pt->read_buf_size;
|
||||
|
||||
pt->rw_random = 1;
|
||||
scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
|
||||
|
||||
/* ensure write_pos is writable. */
|
||||
scm_dynwind_end ();
|
||||
|
||||
/* Ensure WRITE_POS is writable. */
|
||||
if ((modes & SCM_WRTNG) && pt->write_pos == pt->write_end)
|
||||
st_flush (z);
|
||||
|
||||
scm_i_set_port_encoding_x (z, "UTF-8");
|
||||
scm_i_set_conversion_strategy_x (z, SCM_FAILED_CONVERSION_ERROR);
|
||||
return z;
|
||||
}
|
||||
|
||||
SCM
|
||||
scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
|
||||
/* Create a new string from the buffer of PORT, a string port, converting from
|
||||
PORT's encoding to the standard string representation. */
|
||||
SCM
|
||||
scm_strport_to_string (SCM port)
|
||||
{
|
||||
SCM z;
|
||||
size_t str_len;
|
||||
char *buf;
|
||||
|
||||
SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
|
||||
|
||||
/* Because ports are inherently 8-bit, strings need to be converted
|
||||
to a locale representation for storage. But, since string ports
|
||||
rely on string functionality for their memory management, we need
|
||||
to create a new string that has the 8-bit locale representation
|
||||
of the underlying string. This violates the guideline that the
|
||||
internal encoding of characters in strings is in unicode
|
||||
codepoints. */
|
||||
|
||||
/* String ports are are always initialized with "UTF-8" as their
|
||||
encoding. */
|
||||
buf = scm_to_stringn (str, &str_len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
|
||||
z = scm_i_mkstrport (pos, buf, str_len, modes, caller);
|
||||
free (buf);
|
||||
return z;
|
||||
}
|
||||
|
||||
/* Create a new string from a string port's buffer, converting from
|
||||
the port's 8-bit locale-specific representation to the standard
|
||||
string representation. */
|
||||
SCM scm_strport_to_string (SCM port)
|
||||
{
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
SCM str;
|
||||
|
||||
scm_t_port *pt = SCM_PTAB_ENTRY (port);
|
||||
|
||||
if (pt->rw_active == SCM_PORT_WRITE)
|
||||
st_flush (port);
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#ifndef SCM_STRPORTS_H
|
||||
#define SCM_STRPORTS_H
|
||||
|
||||
/* Copyright (C) 1995,1996,2000,2001,2002, 2006, 2008 Free Software Foundation, Inc.
|
||||
/* Copyright (C) 1995,1996,2000,2001,2002, 2006, 2008, 2010 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
|
|
@ -44,8 +44,6 @@ SCM_API scm_t_bits scm_tc16_strport;
|
|||
|
||||
|
||||
SCM_API SCM scm_mkstrport (SCM pos, SCM str, long modes, const char * caller);
|
||||
SCM_INTERNAL SCM scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len,
|
||||
long modes, const char *caller);
|
||||
SCM_API SCM scm_strport_to_string (SCM port);
|
||||
SCM_API SCM scm_object_to_string (SCM obj, SCM printer);
|
||||
SCM_API SCM scm_call_with_output_string (SCM proc);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
;;;; test-suite/lib.scm --- generic support for testing
|
||||
;;;; Copyright (C) 1999, 2000, 2001, 2004, 2006, 2007, 2009 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 1999, 2000, 2001, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
|
||||
;;;;
|
||||
;;;; This program is free software; you can redistribute it and/or
|
||||
;;;; modify it under the terms of the GNU Lesser General Public
|
||||
|
|
@ -30,6 +30,7 @@
|
|||
exception:numerical-overflow
|
||||
exception:struct-set!-denied
|
||||
exception:system-error
|
||||
exception:encoding-error
|
||||
exception:miscellaneous-error
|
||||
exception:string-contains-nul
|
||||
exception:read-error
|
||||
|
|
@ -267,6 +268,8 @@ with-locale with-locale*
|
|||
(cons 'misc-error "^set! denied for field"))
|
||||
(define exception:system-error
|
||||
(cons 'system-error ".*"))
|
||||
(define exception:encoding-error
|
||||
(cons 'misc-error "(cannot convert to output locale|input locale conversion error)"))
|
||||
(define exception:miscellaneous-error
|
||||
(cons 'misc-error "^.*"))
|
||||
(define exception:read-error
|
||||
|
|
@ -389,15 +392,18 @@ with-locale with-locale*
|
|||
|
||||
;;;; Turn a test name into a nice human-readable string.
|
||||
(define (format-test-name name)
|
||||
(call-with-output-string
|
||||
(lambda (port)
|
||||
(let loop ((name name)
|
||||
(separator ""))
|
||||
(if (pair? name)
|
||||
(begin
|
||||
(display separator port)
|
||||
(display (car name) port)
|
||||
(loop (cdr name) ": ")))))))
|
||||
;; Choose a Unicode-capable encoding so that the string port can contain any
|
||||
;; valid Unicode character.
|
||||
(with-fluids ((%default-port-encoding "UTF-8"))
|
||||
(call-with-output-string
|
||||
(lambda (port)
|
||||
(let loop ((name name)
|
||||
(separator ""))
|
||||
(if (pair? name)
|
||||
(begin
|
||||
(display separator port)
|
||||
(display (car name) port)
|
||||
(loop (cdr name) ": "))))))))
|
||||
|
||||
;;;; For a given test-name, deliver the full name including all prefixes.
|
||||
(define (full-name name)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
;;;; ports.test --- test suite for Guile I/O ports -*- scheme -*-
|
||||
;;;; ports.test --- Guile I/O ports. -*- coding: utf-8; mode: scheme; -*-
|
||||
;;;; Jim Blandy <jimb@red-bean.com> --- May 1999
|
||||
;;;;
|
||||
;;;; Copyright (C) 1999, 2001, 2004, 2006, 2007, 2009 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 1999, 2001, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
|
||||
;;;;
|
||||
;;;; This library is free software; you can redistribute it and/or
|
||||
;;;; modify it under the terms of the GNU Lesser General Public
|
||||
|
|
@ -307,7 +307,42 @@
|
|||
(string-set! text 0 #\a)
|
||||
(string-set! text (- len 1) #\b)
|
||||
(pass-if "output check"
|
||||
(string=? text result))))
|
||||
(string=? text result)))
|
||||
|
||||
(pass-if "%default-port-encoding is honored"
|
||||
(let ((encodings '("UTF-8" "UTF-16" "ISO-8859-1" "ISO-8859-3")))
|
||||
(equal? (map (lambda (e)
|
||||
(with-fluids ((%default-port-encoding e))
|
||||
(call-with-output-string
|
||||
(lambda (p)
|
||||
(display (port-encoding p) p)))))
|
||||
encodings)
|
||||
encodings)))
|
||||
|
||||
(pass-if "suitable encoding [latin-1]"
|
||||
(let ((str "hello, world"))
|
||||
(with-fluids ((%default-port-encoding "ISO-8859-1"))
|
||||
(equal? str
|
||||
(with-output-to-string
|
||||
(lambda ()
|
||||
(display str)))))))
|
||||
|
||||
(pass-if "suitable encoding [latin-3]"
|
||||
(let ((str "ĉu bone?"))
|
||||
(with-fluids ((%default-port-encoding "ISO-8859-3"))
|
||||
(equal? str
|
||||
(with-output-to-string
|
||||
(lambda ()
|
||||
(display str)))))))
|
||||
|
||||
(pass-if-exception "wrong encoding"
|
||||
exception:encoding-error
|
||||
(let ((str "ĉu bone?"))
|
||||
;; Latin-1 cannot represent ‘ĉ’.
|
||||
(with-fluids ((%default-port-encoding "ISO-8859-1"))
|
||||
(with-output-to-string
|
||||
(lambda ()
|
||||
(display str)))))))
|
||||
|
||||
(with-test-prefix "call-with-output-string"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
;;;; r6rs-ports.test --- Exercise the R6RS I/O port API.
|
||||
;;;; r6rs-ports.test --- R6RS I/O port tests. -*- coding: iso-8859-1; -*-
|
||||
;;;;
|
||||
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
|
||||
;;;; Ludovic Courtès
|
||||
;;;;
|
||||
;;;; This library is free software; you can redistribute it and/or
|
||||
|
|
@ -219,7 +219,25 @@
|
|||
(port (%make-void-port "w")))
|
||||
|
||||
(close-port port)
|
||||
(put-bytevector port bv))))
|
||||
(put-bytevector port bv)))
|
||||
|
||||
(pass-if "put-bytevector with UTF-16 string port"
|
||||
(let* ((str "hello, world")
|
||||
(bv (string->utf16 str)))
|
||||
(equal? str
|
||||
(with-fluids ((%default-port-encoding "UTF-16BE"))
|
||||
(call-with-output-string
|
||||
(lambda (port)
|
||||
(put-bytevector port bv)))))))
|
||||
|
||||
(pass-if-exception "put-bytevector with wrong-encoding string port"
|
||||
exception:encoding-error
|
||||
(let* ((str "hello, world")
|
||||
(bv (string->utf16 str)))
|
||||
(with-fluids ((%default-port-encoding "UTF-32"))
|
||||
(call-with-output-string
|
||||
(lambda (port)
|
||||
(put-bytevector port bv)))))))
|
||||
|
||||
|
||||
(with-test-prefix "7.2.7 Input Ports"
|
||||
|
|
@ -452,8 +470,6 @@
|
|||
(not eof?)
|
||||
(bytevector=? sink source)))))
|
||||
|
||||
|
||||
;;; Local Variables:
|
||||
;;; coding: latin-1
|
||||
;;; mode: scheme
|
||||
;;; End:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
;;;; reader.test --- Exercise the reader. -*- Scheme -*-
|
||||
;;;; reader.test --- Reader test. -*- coding: iso-8859-1; mode: scheme -*-
|
||||
;;;;
|
||||
;;;; Copyright (C) 1999, 2001, 2002, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
|
||||
;;;; Copyright (C) 1999, 2001, 2002, 2003, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
|
||||
;;;; Jim Blandy <jimb@red-bean.com>
|
||||
;;;;
|
||||
;;;; This library is free software; you can redistribute it and/or
|
||||
|
|
@ -41,7 +41,8 @@
|
|||
|
||||
|
||||
(define (read-string s)
|
||||
(with-input-from-string s (lambda () (read))))
|
||||
(with-fluids ((%default-port-encoding #f))
|
||||
(with-input-from-string s (lambda () (read)))))
|
||||
|
||||
(define (with-read-options opts thunk)
|
||||
(let ((saved-options (read-options)))
|
||||
|
|
@ -110,8 +111,8 @@
|
|||
|
||||
(pass-if "unprintable symbol"
|
||||
;; The reader tolerates unprintable characters for symbols.
|
||||
(equal? (string->symbol "\001\002\003")
|
||||
(read-string "\001\002\003")))
|
||||
(equal? (string->symbol "\x01\x02\x03")
|
||||
(read-string "\x01\x02\x03")))
|
||||
|
||||
(pass-if "CR recognized as a token delimiter"
|
||||
;; In 1.8.3, character 0x0d was not recognized as a delimiter.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue