247 lines
10 KiB
Scheme
247 lines
10 KiB
Scheme
|
|
;;;; (sxml ssax) -- the SSAX parser
|
|||
|
|
;;;;
|
|||
|
|
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
|
|||
|
|
;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
|
|||
|
|
;;;; Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm.
|
|||
|
|
;;;;
|
|||
|
|
;;;; This library is free software; you can redistribute it and/or
|
|||
|
|
;;;; modify it under the terms of the GNU Lesser General Public
|
|||
|
|
;;;; License as published by the Free Software Foundation; either
|
|||
|
|
;;;; version 3 of the License, or (at your option) any later version.
|
|||
|
|
;;;;
|
|||
|
|
;;;; This library is distributed in the hope that it will be useful,
|
|||
|
|
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|||
|
|
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|||
|
|
;;;; Lesser General Public License for more details.
|
|||
|
|
;;;;
|
|||
|
|
;;;; You should have received a copy of the GNU Lesser General Public
|
|||
|
|
;;;; License along with this library; if not, write to the Free Software
|
|||
|
|
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||
|
|
;;;;
|
|||
|
|
|
|||
|
|
;;; Commentary:
|
|||
|
|
;;
|
|||
|
|
;@subheading Functional XML parsing framework
|
|||
|
|
;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation
|
|||
|
|
;
|
|||
|
|
; This is a package of low-to-high level lexing and parsing procedures
|
|||
|
|
; that can be combined to yield a SAX, a DOM, a validating parser, or
|
|||
|
|
; a parser intended for a particular document type. The procedures in
|
|||
|
|
; the package can be used separately to tokenize or parse various
|
|||
|
|
; pieces of XML documents. The package supports XML Namespaces,
|
|||
|
|
; internal and external parsed entities, user-controlled handling of
|
|||
|
|
; whitespace, and validation. This module therefore is intended to be
|
|||
|
|
; a framework, a set of "Lego blocks" you can use to build a parser
|
|||
|
|
; following any discipline and performing validation to any degree. As
|
|||
|
|
; an example of the parser construction, this file includes a
|
|||
|
|
; semi-validating SXML parser.
|
|||
|
|
|
|||
|
|
; The present XML framework has a "sequential" feel of SAX yet a
|
|||
|
|
; "functional style" of DOM. Like a SAX parser, the framework scans the
|
|||
|
|
; document only once and permits incremental processing. An application
|
|||
|
|
; that handles document elements in order can run as efficiently as
|
|||
|
|
; possible. @emph{Unlike} a SAX parser, the framework does not require
|
|||
|
|
; an application register stateful callbacks and surrender control to
|
|||
|
|
; the parser. Rather, it is the application that can drive the framework
|
|||
|
|
; -- calling its functions to get the current lexical or syntax element.
|
|||
|
|
; These functions do not maintain or mutate any state save the input
|
|||
|
|
; port. Therefore, the framework permits parsing of XML in a pure
|
|||
|
|
; functional style, with the input port being a monad (or a linear,
|
|||
|
|
; read-once parameter).
|
|||
|
|
|
|||
|
|
; Besides the @var{port}, there is another monad -- @var{seed}. Most of
|
|||
|
|
; the middle- and high-level parsers are single-threaded through the
|
|||
|
|
; @var{seed}. The functions of this framework do not process or affect
|
|||
|
|
; the @var{seed} in any way: they simply pass it around as an instance
|
|||
|
|
; of an opaque datatype. User functions, on the other hand, can use the
|
|||
|
|
; seed to maintain user's state, to accumulate parsing results, etc. A
|
|||
|
|
; user can freely mix his own functions with those of the framework. On
|
|||
|
|
; the other hand, the user may wish to instantiate a high-level parser:
|
|||
|
|
; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter
|
|||
|
|
; case, the user must provide functions of specific signatures, which
|
|||
|
|
; are called at predictable moments during the parsing: to handle
|
|||
|
|
; character data, element data, or processing instructions (PI). The
|
|||
|
|
; functions are always given the @var{seed}, among other parameters, and
|
|||
|
|
; must return the new @var{seed}.
|
|||
|
|
|
|||
|
|
; From a functional point of view, XML parsing is a combined
|
|||
|
|
; pre-post-order traversal of a "tree" that is the XML document
|
|||
|
|
; itself. This down-and-up traversal tells the user about an element
|
|||
|
|
; when its start tag is encountered. The user is notified about the
|
|||
|
|
; element once more, after all element's children have been
|
|||
|
|
; handled. The process of XML parsing therefore is a fold over the
|
|||
|
|
; raw XML document. Unlike a fold over trees defined in [1], the
|
|||
|
|
; parser is necessarily single-threaded -- obviously as elements
|
|||
|
|
; in a text XML document are laid down sequentially. The parser
|
|||
|
|
; therefore is a tree fold that has been transformed to accept an
|
|||
|
|
; accumulating parameter [1,2].
|
|||
|
|
|
|||
|
|
; Formally, the denotational semantics of the parser can be expressed
|
|||
|
|
; as
|
|||
|
|
;@smallexample
|
|||
|
|
; parser:: (Start-tag -> Seed -> Seed) ->
|
|||
|
|
; (Start-tag -> Seed -> Seed -> Seed) ->
|
|||
|
|
; (Char-Data -> Seed -> Seed) ->
|
|||
|
|
; XML-text-fragment -> Seed -> Seed
|
|||
|
|
; parser fdown fup fchar "<elem attrs> content </elem>" seed
|
|||
|
|
; = fup "<elem attrs>" seed
|
|||
|
|
; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed))
|
|||
|
|
;
|
|||
|
|
; parser fdown fup fchar "char-data content" seed
|
|||
|
|
; = parser fdown fup fchar "content" (fchar "char-data" seed)
|
|||
|
|
;
|
|||
|
|
; parser fdown fup fchar "elem-content content" seed
|
|||
|
|
; = parser fdown fup fchar "content" (
|
|||
|
|
; parser fdown fup fchar "elem-content" seed)
|
|||
|
|
;@end smallexample
|
|||
|
|
|
|||
|
|
; Compare the last two equations with the left fold
|
|||
|
|
;@smallexample
|
|||
|
|
; fold-left kons elem:list seed = fold-left kons list (kons elem seed)
|
|||
|
|
;@end smallexample
|
|||
|
|
|
|||
|
|
; The real parser created by @code{SSAX:make-parser} is slightly more
|
|||
|
|
; complicated, to account for processing instructions, entity
|
|||
|
|
; references, namespaces, processing of document type declaration, etc.
|
|||
|
|
|
|||
|
|
|
|||
|
|
; The XML standard document referred to in this module is
|
|||
|
|
; @uref{http://www.w3.org/TR/1998/REC-xml-19980210.html}
|
|||
|
|
;
|
|||
|
|
; The present file also defines a procedure that parses the text of an
|
|||
|
|
; XML document or of a separate element into SXML, an S-expression-based
|
|||
|
|
; model of an XML Information Set. SXML is also an Abstract Syntax Tree
|
|||
|
|
; of an XML document. SXML is similar but not identical to DOM; SXML is
|
|||
|
|
; particularly suitable for Scheme-based XML/HTML authoring, SXPath
|
|||
|
|
; queries, and tree transformations. See SXML.html for more details.
|
|||
|
|
; SXML is a term implementation of evaluation of the XML document [3].
|
|||
|
|
; The other implementation is context-passing.
|
|||
|
|
|
|||
|
|
; The present frameworks fully supports the XML Namespaces Recommendation:
|
|||
|
|
; @uref{http://www.w3.org/TR/REC-xml-names/}
|
|||
|
|
; Other links:
|
|||
|
|
;@table @asis
|
|||
|
|
;@item [1]
|
|||
|
|
; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold,"
|
|||
|
|
; Proc. ICFP'98, 1998, pp. 273-279.
|
|||
|
|
;@item [2]
|
|||
|
|
; Richard S. Bird, The promotion and accumulation strategies in
|
|||
|
|
; transformational programming, ACM Trans. Progr. Lang. Systems,
|
|||
|
|
; 6(4):487-504, October 1984.
|
|||
|
|
;@item [3]
|
|||
|
|
; Ralf Hinze, "Deriving Backtracking Monad Transformers,"
|
|||
|
|
; Functional Pearl. Proc ICFP'00, pp. 186-197.
|
|||
|
|
;@end table
|
|||
|
|
;;
|
|||
|
|
;;; Code:
|
|||
|
|
|
|||
|
|
(define-module (sxml ssax)
|
|||
|
|
#:use-module (sxml ssax input-parse)
|
|||
|
|
#:use-module (srfi srfi-1)
|
|||
|
|
#:use-module (srfi srfi-13)
|
|||
|
|
|
|||
|
|
#:export (current-ssax-error-port
|
|||
|
|
with-ssax-error-to-port
|
|||
|
|
xml-token? xml-token-kind xml-token-head
|
|||
|
|
make-empty-attlist attlist-add
|
|||
|
|
attlist-null?
|
|||
|
|
attlist-remove-top
|
|||
|
|
attlist->alist attlist-fold
|
|||
|
|
ssax:uri-string->symbol
|
|||
|
|
ssax:skip-internal-dtd
|
|||
|
|
ssax:read-pi-body-as-string
|
|||
|
|
ssax:reverse-collect-str-drop-ws
|
|||
|
|
ssax:read-markup-token
|
|||
|
|
ssax:read-cdata-body
|
|||
|
|
ssax:read-char-ref
|
|||
|
|
ssax:read-attributes
|
|||
|
|
ssax:complete-start-tag
|
|||
|
|
ssax:read-external-id
|
|||
|
|
ssax:read-char-data
|
|||
|
|
ssax:xml->sxml
|
|||
|
|
ssax:make-parser
|
|||
|
|
ssax:make-pi-parser
|
|||
|
|
ssax:make-elem-parser))
|
|||
|
|
|
|||
|
|
(define (parser-error port message . rest)
|
|||
|
|
(apply throw 'parser-error port message rest))
|
|||
|
|
(define ascii->char integer->char)
|
|||
|
|
(define char->ascii char->integer)
|
|||
|
|
|
|||
|
|
(define *current-ssax-error-port* (make-fluid))
|
|||
|
|
(define (current-ssax-error-port)
|
|||
|
|
(fluid-ref *current-ssax-error-port*))
|
|||
|
|
|
|||
|
|
(define (with-ssax-error-to-port port thunk)
|
|||
|
|
(with-fluids ((*current-ssax-error-port* port))
|
|||
|
|
(thunk)))
|
|||
|
|
|
|||
|
|
(define (ssax:warn port msg . args)
|
|||
|
|
(format (current-ssax-error-port)
|
|||
|
|
";;; SSAX warning: ~a ~a\n" msg args))
|
|||
|
|
|
|||
|
|
(define (ucscode->string codepoint)
|
|||
|
|
(string (integer->char codepoint)))
|
|||
|
|
|
|||
|
|
(define char-newline #\newline)
|
|||
|
|
(define char-return #\return)
|
|||
|
|
(define char-tab #\tab)
|
|||
|
|
(define nl "\n")
|
|||
|
|
|
|||
|
|
;; if condition is true, execute stmts in turn and return the result of
|
|||
|
|
;; the last statement otherwise, return #f
|
|||
|
|
(define-syntax when
|
|||
|
|
(syntax-rules ()
|
|||
|
|
((when condition . stmts)
|
|||
|
|
(and condition (begin . stmts)))))
|
|||
|
|
|
|||
|
|
;; Execute a sequence of forms and return the result of the _first_ one.
|
|||
|
|
;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with
|
|||
|
|
;; side effects and return a value that must be computed before some or
|
|||
|
|
;; all of the side effects happen.
|
|||
|
|
(define-syntax begin0
|
|||
|
|
(syntax-rules ()
|
|||
|
|
((begin0 form form1 ... )
|
|||
|
|
(let ((val form)) form1 ... val))))
|
|||
|
|
|
|||
|
|
; Like let* but allowing for multiple-value bindings
|
|||
|
|
(define-syntax let*-values
|
|||
|
|
(syntax-rules ()
|
|||
|
|
((let*-values () . bodies) (begin . bodies))
|
|||
|
|
((let*-values (((var) initializer) . rest) . bodies)
|
|||
|
|
(let ((var initializer)) ; a single var optimization
|
|||
|
|
(let*-values rest . bodies)))
|
|||
|
|
((let*-values ((vars initializer) . rest) . bodies)
|
|||
|
|
(call-with-values (lambda () initializer) ; the most generic case
|
|||
|
|
(lambda vars (let*-values rest . bodies))))))
|
|||
|
|
|
|||
|
|
;; needed for some dumb reason
|
|||
|
|
(define inc 1+)
|
|||
|
|
(define dec 1-)
|
|||
|
|
|
|||
|
|
(define-syntax include-from-path/filtered
|
|||
|
|
(lambda (x)
|
|||
|
|
(define (read-filtered accept-list file)
|
|||
|
|
(with-input-from-file (%search-load-path file)
|
|||
|
|
(lambda ()
|
|||
|
|
(let loop ((sexp (read)) (out '()))
|
|||
|
|
(cond
|
|||
|
|
((eof-object? sexp) (reverse out))
|
|||
|
|
((and (pair? sexp) (memq (car sexp) accept-list))
|
|||
|
|
(loop (read) (cons sexp out)))
|
|||
|
|
(else
|
|||
|
|
(loop (read) out)))))))
|
|||
|
|
(syntax-case x ()
|
|||
|
|
((_ accept-list file)
|
|||
|
|
(with-syntax (((exp ...) (datum->syntax
|
|||
|
|
x
|
|||
|
|
(read-filtered
|
|||
|
|
(syntax->datum #'accept-list)
|
|||
|
|
(syntax->datum #'file)))))
|
|||
|
|
#'(begin exp ...))))))
|
|||
|
|
|
|||
|
|
(include-from-path "sxml/upstream/assert.scm")
|
|||
|
|
(include-from-path/filtered
|
|||
|
|
(define define-syntax ssax:define-labeled-arg-macro)
|
|||
|
|
"sxml/upstream/SSAX.scm")
|