detect and consume byte-order marks for textual ports
* libguile/ports.h:
* libguile/ports.c (scm_consume_byte_order_mark): New procedure.
* libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we are opening a file in "r" mode.
* libguile/read.c (scm_i_scan_for_encoding): Don't do anything about byte-order marks.
* libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan.
* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling.
This commit is contained in:
parent
a8fa310b04
commit
b2cb557d75
6 changed files with 169 additions and 30 deletions
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
|
||||
* 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
|
||||
* 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public License
|
||||
|
|
@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
|
|||
}
|
||||
#undef FUNC_NAME
|
||||
|
||||
SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
            (SCM port),
            "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n"
            "in UTF-8 or in UTF-16.  If found, consume the byte-order mark\n"
            "and set the port to the indicated encoding.\n"
            "\n"
            "As a special case, if the port's encoding is already UTF-16LE\n"
            "or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
            "has already asked for an explicit byte order.  In this case no\n"
            "scan is performed, and the byte-order mark (if any) is left in\n"
            "the port.\n"
            "\n"
            "Return @code{#t} if a byte-order mark was consumed, and\n"
            "@code{#f} otherwise.")
#define FUNC_NAME s_scm_consume_byte_order_mark
{
  const char *encoding;
  int lead;

  /* NOTE(review): only SCM_VALIDATE_PORT is applied; the peek/get calls
     below presumably require an open input port -- confirm with callers.  */
  SCM_VALIDATE_PORT (1, port);

  encoding = SCM_PTAB_ENTRY (port)->encoding;

  /* An explicit byte order was already requested: do not scan, and
     leave any byte-order mark in the port.  */
  if (encoding
      && (strcasecmp (encoding, "UTF-16BE") == 0
          || strcasecmp (encoding, "UTF-16LE") == 0))
    return SCM_BOOL_F;

  lead = scm_peek_byte_or_eof (port);

  /* UTF-8 byte-order mark: EF BB BF.  */
  if (lead == 0xEF)
    {
      scm_get_byte_or_eof (port);

      if (scm_peek_byte_or_eof (port) != 0xBB)
        {
          scm_unget_byte (0xEF, port);
          return SCM_BOOL_F;
        }
      scm_get_byte_or_eof (port);

      if (scm_peek_byte_or_eof (port) != 0xBF)
        {
          /* Give back the two bytes already taken, last one first.  */
          scm_unget_byte (0xBB, port);
          scm_unget_byte (0xEF, port);
          return SCM_BOOL_F;
        }
      scm_get_byte_or_eof (port);

      scm_i_set_port_encoding_x (port, "UTF-8");
      return SCM_BOOL_T;
    }

  /* UTF-16 big-endian byte-order mark: FE FF.  */
  if (lead == 0xFE)
    {
      scm_get_byte_or_eof (port);

      if (scm_peek_byte_or_eof (port) != 0xFF)
        {
          scm_unget_byte (0xFE, port);
          return SCM_BOOL_F;
        }
      scm_get_byte_or_eof (port);

      scm_i_set_port_encoding_x (port, "UTF-16BE");
      return SCM_BOOL_T;
    }

  /* UTF-16 little-endian byte-order mark: FF FE.  */
  if (lead == 0xFF)
    {
      scm_get_byte_or_eof (port);

      if (scm_peek_byte_or_eof (port) != 0xFE)
        {
          scm_unget_byte (0xFF, port);
          return SCM_BOOL_F;
        }
      scm_get_byte_or_eof (port);

      scm_i_set_port_encoding_x (port, "UTF-16LE");
      return SCM_BOOL_T;
    }

  /* Any other first byte (or end of file): nothing was consumed.  */
  return SCM_BOOL_F;
}
#undef FUNC_NAME
|
||||
|
||||
/* A fluid specifying the default encoding for newly created ports. If it is
|
||||
a string, that is the encoding. If it is #f, it is in the "native"
|
||||
(Latin-1) encoding. */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue