detect and consume byte-order marks for textual ports

* libguile/ports.h:
* libguile/ports.c (scm_consume_byte_order_mark): New procedure.
* libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we are opening a file in "r" mode.
* libguile/read.c (scm_i_scan_for_encoding): Don't do anything about byte-order marks.
* libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan.
* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling.
2013-01-30 10:17:25 +01:00 · 2013-01-30 10:17:25 +01:00 · b2cb557d75
commit b2cb557d75
parent a8fa310b04
6 changed files with 169 additions and 30 deletions
--- a/libguile/ports.c
+++ b/libguile/ports.c
@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- *   2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
 }
 #undef FUNC_NAME

+SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
+            (SCM port),
+            "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n"
+            "in UTF-8 or in UTF-16.  If found, consume the byte-order mark\n"
+            "and set the port to the indicated encoding.\n"
+            "\n"
+            "As a special case, if the port's encoding is already UTF-16LE\n"
+            "or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
+            "has already asked for an explicit byte order.  In this case no\n"
+            "scan is performed, and the byte-order mark (if any) is left in\n"
+            "the port.\n"
+            "\n"
+            "Return @code{#t} if a byte-order mark was consumed, and\n"
+            "@code{#f} otherwise.")
+#define FUNC_NAME s_scm_consume_byte_order_mark
+{ /* Peek a byte, consume it on a match, push consumed bytes back on a mismatch.  */
+  scm_t_port *pt;     /* entry from SCM_PTAB_ENTRY; only its encoding field is read here */
+  const char *enc;    /* the port's current encoding name; NULL-checked before each use */
+
+  SCM_VALIDATE_PORT (1, port); /* presumably rejects a non-port argument 1 -- macro defined elsewhere */
+
+  pt = SCM_PTAB_ENTRY (port);
+  enc = pt->encoding;
+
+  if (enc && strcasecmp (enc, "UTF-16BE") == 0) /* explicit byte order (any letter case): skip the scan */
+    return SCM_BOOL_F; /* any BOM is left in the port */
+
+  if (enc && strcasecmp (enc, "UTF-16LE") == 0) /* same special case for little-endian */
+    return SCM_BOOL_F;
+
+  switch (scm_peek_byte_or_eof (port)) /* dispatch on the first byte without consuming it */
+    {
+    case 0xEF: /* candidate UTF-8 BOM: EF BB BF */
+      scm_get_byte_or_eof (port); /* consume EF so the second byte can be peeked */
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xBB:
+          scm_get_byte_or_eof (port); /* consume BB so the third byte can be peeked */
+          switch (scm_peek_byte_or_eof (port))
+            {
+            case 0xBF:
+              scm_get_byte_or_eof (port); /* full BOM seen: consume its last byte */
+              scm_i_set_port_encoding_x (port, "UTF-8");
+              return SCM_BOOL_T;
+            default: /* EF BB not followed by BF: not a BOM */
+              scm_unget_byte (0xBB, port); /* BB was read last, so it is pushed back first */
+              scm_unget_byte (0xEF, port); /* presumably unget is LIFO, so EF is re-read first -- confirm */
+              return SCM_BOOL_F;
+            }
+        default: /* EF not followed by BB: not a BOM */
+          scm_unget_byte (0xEF, port); /* restore the single consumed byte */
+          return SCM_BOOL_F;
+        }
+    case 0xFE: /* candidate UTF-16BE BOM: FE FF */
+      scm_get_byte_or_eof (port); /* consume FE so the second byte can be peeked */
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFF:
+          scm_get_byte_or_eof (port); /* full BOM seen: consume its last byte */
+          scm_i_set_port_encoding_x (port, "UTF-16BE");
+          return SCM_BOOL_T;
+        default: /* FE not followed by FF: not a BOM */
+          scm_unget_byte (0xFE, port); /* restore the single consumed byte */
+          return SCM_BOOL_F;
+        }
+    case 0xFF: /* candidate UTF-16LE BOM: FF FE */
+      scm_get_byte_or_eof (port); /* consume FF so the second byte can be peeked */
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFE: /* NOTE(review): FF FE also begins a UTF-32LE BOM (FF FE 00 00); not distinguished here */
+          scm_get_byte_or_eof (port); /* full BOM seen: consume its last byte */
+          scm_i_set_port_encoding_x (port, "UTF-16LE");
+          return SCM_BOOL_T;
+        default: /* FF not followed by FE: not a BOM */
+          scm_unget_byte (0xFF, port); /* restore the single consumed byte */
+          return SCM_BOOL_F;
+        }
+    default: /* any other peeked value (presumably including EOF): nothing was consumed */
+      return SCM_BOOL_F;
+    }
+}
+#undef FUNC_NAME
+
 /* A fluid specifying the default encoding for newly created ports.  If it is
   a string, that is the encoding.  If it is #f, it is in the "native"
   (Latin-1) encoding.  */