Module: htmlprag

 @Package     HtmlPrag
 @Subtitle    Pragmatic Parsing and Emitting of HTML using SXML and SHTML
 @HomePage    http://www.neilvandyke.org/htmlprag/
 @Author      Neil W. Van Dyke
 @AuthorEmail neil@@neilvandyke.org
 @Version     0.13
 @Date        2005-02-23


f: %htmlprag:a2c
f: %htmlprag:append!
f: %htmlprag:reverse!ok
f: %htmlprag:down
f: %htmlprag:down!ok
f: %htmlprag:gosc
f: shtml-comment-symbol
f: shtml-decl-symbol
f: shtml-empty-symbol
f: shtml-end-symbol
f: shtml-entity-symbol
f: shtml-pi-symbol
f: shtml-start-symbol
f: shtml-text-symbol
f: shtml-top-symbol
f: shtml-named-char-id
f: shtml-numeric-char-id
f: make-shtml-entity
f: shtml-entity-value
f: make-html-tokenizer
f: tokenize-html
f: shtml-token-kind
f: %htmlprag:empty-elements
f: parse-html/tokenizer
f: %htmlprag:parse-html
f: html->sxml-0nf
f: html->sxml-1nf
f: html->sxml-2nf
f: html->sxml
f: html->shtml
f: %htmlprag:write-shtml-as-html/fixed
f: write-shtml-as-html
f: shtml->html
f: sxml->html
f: write-sxml-html
f: %htmlprag:test

%htmlprag:a2c

(define %htmlprag:a2c
... Full Code ... )


%htmlprag:append!

(define (%htmlprag:append! a b)
... Full Code ... )


%htmlprag:reverse!ok

(define %htmlprag:reverse!ok
... Full Code ... )


%htmlprag:down

(define (%htmlprag:down s)
... Full Code ... )


%htmlprag:down!ok

(define %htmlprag:down!ok
... Full Code ... )


%htmlprag:gosc

(define (%htmlprag:gosc os)
... Full Code ... )


shtml-comment-symbol

(define shtml-comment-symbol
... Full Code ... )


shtml-decl-symbol

(define shtml-decl-symbol
... Full Code ... )


shtml-empty-symbol

(define shtml-empty-symbol
... Full Code ... )


shtml-end-symbol

(define shtml-end-symbol
... Full Code ... )


shtml-entity-symbol

(define shtml-entity-symbol
... Full Code ... )


shtml-pi-symbol

(define shtml-pi-symbol
... Full Code ... )


shtml-start-symbol

(define shtml-start-symbol
... Full Code ... )


shtml-text-symbol

(define shtml-text-symbol
... Full Code ... )


shtml-top-symbol

(define shtml-top-symbol
... Full Code ... )


shtml-named-char-id

(define shtml-named-char-id
... Full Code ... )


shtml-numeric-char-id

(define shtml-numeric-char-id
... Full Code ... )


make-shtml-entity

(define (make-shtml-entity val)
... Full Code ... )


shtml-entity-value

(define (shtml-entity-value entity)
... Full Code ... )


make-html-tokenizer

(define make-html-tokenizer
... Full Code ... )


tokenize-html

(define (tokenize-html in normalized?)
... Full Code ... )


shtml-token-kind

(define (shtml-token-kind token)
... Full Code ... )


%htmlprag:empty-elements

(define %htmlprag:empty-elements
... Full Code ... )


parse-html/tokenizer

(define parse-html/tokenizer
... Full Code ... )


%htmlprag:parse-html

(define (%htmlprag:parse-html input normalized? top?)
... Full Code ... )


html->sxml-0nf

(define (html->sxml-0nf input)
... Full Code ... )


html->sxml-1nf

(define (html->sxml-1nf input)
... Full Code ... )


html->sxml-2nf

(define (html->sxml-2nf input)
... Full Code ... )


html->sxml

(define html->sxml
... Full Code ... )


html->shtml

(define html->shtml
... Full Code ... )


%htmlprag:write-shtml-as-html/fixed

(define (%htmlprag:write-shtml-as-html/fixed shtml out foreign-filter)
... Full Code ... )


write-shtml-as-html

(define write-shtml-as-html
... Full Code ... )


shtml->html

(define (shtml->html shtml)
... Full Code ... )


sxml->html

(define sxml->html
... Full Code ... )


write-sxml-html

(define write-sxml-html
... Full Code ... )


%htmlprag:test

(define (%htmlprag:test)
... Full Code ... )


Code

cond-expand
Index
 DL: disable tests for Bigloo
(cond-expand
 ((or bigloo chicken gambit)
  (define-macro (%htmlprag:testeez . x) #t)
 )
 (else

(define-syntax %htmlprag:testeez
  (syntax-rules () ((_ x ...)
                    ;;(testeez x ...)
                    (error "Tests disabled.")
                    )))
))

%htmlprag:a2c

Index
(define %htmlprag:a2c integer->char)

%htmlprag:append!

Index
(define (%htmlprag:append! a b)
  (cond ((null? a) b)
        ((null? b) a)
        (else      (let loop  ((sub a))
                     (if (null? (cdr sub))
                         (begin (set-cdr! sub b)
                                a)
                         (loop (cdr sub)))))))

%htmlprag:reverse!ok

Index
(define %htmlprag:reverse!ok reverse)
cond-expand
Index
(cond-expand  ; DL: wrote this explicitly as a cond-expand
 ((or bigloo chicken gambit)
   (define %htmlprag:error error)
 )
 (else 

(define-syntax %htmlprag:error
  (syntax-rules ()
    ((_ p m o) (error (string-append p " : " m) o))
    ;; ((_ p m o) (error p m o))))
    ))
))

%htmlprag:down

Index
(define (%htmlprag:down s)
  (list->string (map char-downcase (string->list s))))

%htmlprag:down!ok

Index
(define %htmlprag:down!ok %htmlprag:down)

%htmlprag:gosc

Index
(define (%htmlprag:gosc os)
  (let ((str (get-output-string os)))
    ;; Note: By default, we don't call close-output-port, since at least one
    ;; tested Scheme implementation barfs on that.
    ;;
    ;; (close-output-port os)
    str))

shtml-comment-symbol

Index
(define shtml-comment-symbol (string->symbol "*COMMENT*"))

shtml-decl-symbol

Index
(define shtml-decl-symbol    (string->symbol "*DECL*"))

shtml-empty-symbol

Index
(define shtml-empty-symbol   (string->symbol "*EMPTY*"))

shtml-end-symbol

Index
(define shtml-end-symbol     (string->symbol "*END*"))

shtml-entity-symbol

Index
(define shtml-entity-symbol  (string->symbol "*ENTITY*"))

shtml-pi-symbol

Index
(define shtml-pi-symbol      (string->symbol "*PI*"))

shtml-start-symbol

Index
(define shtml-start-symbol   (string->symbol "*START*"))

shtml-text-symbol

Index
(define shtml-text-symbol    (string->symbol "*TEXT*"))

shtml-top-symbol

Index
(define shtml-top-symbol     (string->symbol "*TOP*"))

shtml-named-char-id

Index
(define shtml-named-char-id   "shtml-named-char")

shtml-numeric-char-id

Index
(define shtml-numeric-char-id "shtml-numeric-char")

make-shtml-entity

Index
(define (make-shtml-entity val)
  (list '& (cond ((symbol?  val) val)
                 ((integer? val) val)
                 ((string?  val) (string->symbol val))
                 (else (%htmlprag:error "make-shtml-entity"
                                        "invalid SHTML entity value:"
                                        val)))))

shtml-entity-value

Index
(define (shtml-entity-value entity)
  (cond ((not (pair? entity)) #f)
        ((null? (cdr entity)) #f)
        ((eqv? (car entity) '&)
         ;; TODO: Error-check for extraneous list members?
         (let ((val (cadr entity)))
           (cond ((symbol?  val) val)
                 ((integer? val) val)
                 ((string?  val) (string->symbol val))
                 (else           #f))))
        ((eqv? (car entity) shtml-entity-symbol)
         (if (null? (cddr entity))
             #f
             (let ((public-id (list-ref entity 1))
                   (system-id (list-ref entity 2)))
               ;; TODO: Error-check for extraneous list members?
               (cond ((equal? public-id shtml-named-char-id)
                      (string->symbol system-id))
                     ((equal? public-id shtml-numeric-char-id)
                      (string->number system-id))
                     (else #f)))))
        (else #f)))

make-html-tokenizer

Index
(define make-html-tokenizer
  ;; TODO: Have the tokenizer replace contiguous whitespace within individual
  ;; text tokens with single space characters (except for when in `pre' and
  ;; verbatim elements).  The parser will introduce new contiguous whitespace
  ;; (e.g., when text tokens are concatenated, invalid end tags are removed,
  ;; whitespace is irrelevant between certain elements), but then the parser
  ;; only has to worry about the first and last character of each string.
  ;; Perhaps the text tokens should have both leading and trailing whitespace
  ;; stripped, and contain flags for whether or not leading and trailing
  ;; whitespace occurred.
  (letrec ((no-token '())

           ;; TODO: Maybe make this an option.
           (verbatim-to-eof-elems '(plaintext))

           ;; TODO: Implement proper parsing of `verbatim-pair-elems' elements.
           ;; Note that we must support invalid termination like this:
           (verbatim-pair-elems '(script server style xmp))

           (ws-chars (list #\space
                           (%htmlprag:a2c 9)
                           (%htmlprag:a2c 10)
                           (%htmlprag:a2c 11)
                           (%htmlprag:a2c 12)
                           (%htmlprag:a2c 13)))

           (output-string->string-or-false
            (lambda (os)
              (let ((s (%htmlprag:gosc os)))
                (if (string=? s "") #f s))))

           (output-string->symbol-or-false
            (lambda (os)
              (let ((s (output-string->string-or-false os)))
                (if s (string->symbol s) #f))))
           )
    (lambda (in normalized?)
      ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to
      ;; be ignored.
      (letrec
          (
           ;; Port buffer with inexpensive unread of one character and slightly
           ;; more expensive pushback of second character to unread.  The
           ;; procedures themselves do no consing.  The tokenizer currently
           ;; needs two-symbol lookahead, due to ambiguous "/" while parsing
           ;; element and attribute names, which could be either empty-tag
           ;; syntax or XML qualified names.
           (c           #f)
           (next-c      #f)
           (c-consumed? #t)
           (read-c      (lambda ()
                          (if c-consumed?
                              (if next-c
                                  (begin (set! c      next-c)
                                         (set! next-c #f))
                                  (set! c (read-char in)))
                              (set! c-consumed? #t))))
           (unread-c    (lambda ()
                          (if c-consumed?
                              (set! c-consumed? #f)
                              ;; TODO: Procedure name in error message really
                              ;; isn't "make-html-tokenizer"...
                              (%htmlprag:error "make-html-tokenizer"
                                               "already unread:"
                                               c))))
           (push-c      (lambda (new-c)
                          (if c-consumed?
                              (begin (set! c           new-c)
                                     (set! c-consumed? #f))
                              (if next-c
                                  (%htmlprag:error
                                   "make-html-tokenizer"
                                   "pushback full:"
                                   c)
                                  (begin (set! next-c      c)
                                         (set! c           new-c)
                                         (set! c-consumed? #f))))))

           ;; TODO: These procedures are a temporary convenience for
           ;; enumerating the pertinent character classes, with an eye towards
           ;; removing redundant tests of character class.  These procedures
           ;; should be eliminated in a future version.
           (c-eof?      (lambda () (eof-object? c)))
           (c-amp?      (lambda () (eqv? c #\&)))
           (c-apos?     (lambda () (eqv? c #\')))
           (c-bang?     (lambda () (eqv? c #\!)))
           (c-colon?    (lambda () (eqv? c #\:)))
           (c-quot?     (lambda () (eqv? c #\")))
           (c-equals?   (lambda () (eqv? c #\=)))
           (c-gt?       (lambda () (eqv? c #\>)))
           (c-lt?       (lambda () (eqv? c #\<)))
           (c-minus?    (lambda () (eqv? c #\-)))
           (c-pound?    (lambda () (eqv? c #\#)))
           (c-ques?     (lambda () (eqv? c #\?)))
           (c-semi?     (lambda () (eqv? c #\;)))
           (c-slash?    (lambda () (eqv? c #\/)))
           (c-splat?    (lambda () (eqv? c #\*)))
           (c-lf?       (lambda () (eqv? c #\newline)))
           (c-angle?    (lambda () (memv c '(#\< #\>))))
           (c-ws?       (lambda () (memv c ws-chars)))
           (c-alpha?    (lambda () (char-alphabetic? c)))
           (c-digit?    (lambda () (char-numeric? c)))
           (c-alphanum? (lambda () (or (c-alpha?) (c-digit?))))
           (c-hexlet?   (lambda () (memv c '(#\a #\b #\c #\d #\e #\f
                                             #\A #\B #\C #\D #\E #\F))))

           (skip-ws     (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c))))

           (make-start-token
            (if normalized?
                (lambda (name ns attrs)
                  (list name (cons '@ attrs)))
                (lambda (name ns attrs)
                  (if (null? attrs)
                      (list name)
                      (list name (cons '@ attrs))))))

           (make-empty-token
            (lambda (name ns attrs)
              (cons shtml-empty-symbol
                    (make-start-token name ns attrs))))

           (make-end-token
            (if normalized?
                (lambda (name ns attrs)
                  (list shtml-end-symbol
                        name
                        (cons '@ attrs)))
                (lambda (name ns attrs)
                  (if (null? attrs)
                      (list shtml-end-symbol name)
                      (list shtml-end-symbol
                            name
                            (cons '@ attrs))))))

           (make-comment-token
            (lambda (str) (list shtml-comment-symbol str)))

           (make-decl-token
            (lambda (parts) (cons shtml-decl-symbol parts)))

           (scan-qname
            ;; TODO: Make sure we don't accept local names that have "*", since
            ;; this can break SXML tools.  Have to validate this afterwards if
            ;; "verbatim-safe?".  Also check for "@" and maybe "@@".  Check
            ;; qname parsing code, especially for verbatim mode.  This is
            ;; important!
            (lambda (verbatim-safe?)
              ;; Note: If we accept some invalid local names, we only need two
              ;; symbols of lookahead to determine the end of a qname.
              (letrec ((os      #f)
                       (ns      '())
                       (vcolons 0)
                       (good-os (lambda ()
                                  (or os
                                      (begin (set! os (open-output-string))
                                             os)))))
                (let loop ()
                  (read-c)
                  (cond ((c-eof?) #f)
                        ((or (c-ws?) (c-splat?))
                         (if verbatim-safe?
                             (unread-c)))
                        ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?))
                         (unread-c))
                        ((c-colon?)
                         (or (null? ns)
                             (set! ns (cons ":" ns)))
                         (if os
                             (begin
                               (set! ns (cons (%htmlprag:gosc os)
                                              ns))
                               (set! os #f)))
                         (loop))
                        ((c-slash?)
                         (read-c)
                         (cond ((or (c-eof?)
                                    (c-ws?)
                                    (c-equals?)
                                    (c-apos?)
                                    (c-quot?)
                                    (c-angle?)
                                    (c-splat?))
                                (unread-c)
                                (push-c #\/))
                               (else (write-char #\/ (good-os))
                                     (write-char c   os)
                                     (loop))))
                        (else (write-char c (good-os))
                              (loop))))
                (let ((ns    (if (null? ns)
                                 #f
                                 (apply string-append
                                        (%htmlprag:reverse!ok ns))))
                      (local (if os (%htmlprag:gosc os) #f)))
                  (if verbatim-safe?
                      ;; TODO: Make sure we don't have ambiguous ":" or drop
                      ;; any characters!
                      (cons ns local)
                      ;; Note: We represent "xml:" and "xmlns:" syntax as
                      ;; normal qnames, for lack of something better to do with
                      ;; them when we don't support XML namespaces.
                      ;;
                      ;; TODO: Local names are currently forced to lowercase,
                      ;; since HTML is usually case-insensitive.  If XML
                      ;; namespaces are used, we might wish to keep local names
                      ;; case-sensitive.
                      (if local
                          (if ns
                              (if (or (string=? ns "xml")
                                      (string=? ns "xmlns"))
                                  (string->symbol (string-append ns ":" local))
                                  (cons ns
                                        (string->symbol
                                         (%htmlprag:down!ok
                                          local))))
                              (string->symbol
                               (%htmlprag:down!ok local)))
                          (if ns
                              (string->symbol
                               (%htmlprag:down!ok ns))
                              ;; TODO: Ensure that it's OK to return #f as a
                              ;; name.
                              #f)))))))

           (scan-tag
            (lambda (start?)
              (skip-ws)
              (let ((tag-name   (scan-qname #f))
                    (tag-ns     #f)
                    (tag-attrs  #f)
                    (tag-empty? #f))
                ;; Scan element name.
                (if (pair? tag-name)
                    (begin (set! tag-ns   (car tag-name))
                           (set! tag-name (cdr tag-name))))
                ;; TODO: Ensure there's no case in which a #f tag-name isn't
                ;; compensated for later.
                ;;
                ;; Scan element attributes.
                (set! tag-attrs
                      (let scan-attr-list ()
                        (read-c)
                        (cond ((c-eof?)   '())
                              ((c-angle?) (unread-c) '())
                              ((c-slash?)
                               (set! tag-empty? #t)
                               (scan-attr-list))
                              ((c-alpha?)
                               (unread-c)
                               (let ((attr (scan-attr)))
                                 (cons attr (scan-attr-list))))
                              (else (scan-attr-list)))))
                ;; Find ">" or unnatural end.
                (let loop ()
                  (read-c)
                  (cond ((c-eof?)   no-token)
                        ((c-slash?) (set! tag-empty? #t) (loop))
                        ((c-gt?)    #f)
                        ((c-ws?)    (loop))
                        (else       (unread-c))))
                ;; Change the tokenizer mode if necessary.
                (cond ((not start?) #f)
                      (tag-empty?   #f)
                      ;; TODO: Maybe make one alist lookup here, instead of
                      ;; two.
                      ((memq tag-name verbatim-to-eof-elems)
                       (set! nexttok verbeof-nexttok))
                      ((memq tag-name verbatim-pair-elems)
                       (set! nexttok (make-verbpair-nexttok tag-name))))
                ;; Return a token object.
                (if start?
                    (if tag-empty?
                        (make-empty-token tag-name tag-ns tag-attrs)
                        (make-start-token tag-name tag-ns tag-attrs))
                    (make-end-token tag-name tag-ns tag-attrs)))))

           (scan-attr
            (lambda ()
              (let ((name (scan-qname #f))
                    (val  #f))
                (if (pair? name)
                    (set! name (cdr name)))
                (let loop-equals-or-end ()
                  (read-c)
                  (cond ((c-eof?) no-token)
                        ((c-ws?)  (loop-equals-or-end))
                        ((c-equals?)
                         (let loop-quote-or-unquoted ()
                           (read-c)
                           (cond ((c-eof?) no-token)
                                 ((c-ws?) (loop-quote-or-unquoted))
                                 ((or (c-apos?) (c-quot?))
                                  (let ((term c))
                                    (set! val (open-output-string))
                                    (let loop-quoted-val ()
                                      (read-c)
                                      (cond ((c-eof?)      #f)
                                            ((eqv? c term) #f)
                                            (else (write-char c val)
                                                  (loop-quoted-val))))))
                                 ((c-angle?) (unread-c))
                                 (else
                                  (set! val (open-output-string))
                                  (write-char c val)
                                  (let loop-unquoted-val ()
                                    (read-c)
                                    (cond ((c-eof?)  no-token)
                                          ((c-apos?) #f)
                                          ((c-quot?) #f)
                                          ((or (c-ws?) (c-angle?)
                                               ;;(c-slash?)
                                               )
                                           (unread-c))
                                          ;; Note: We can treat a slash in an
                                          ;; unquoted attribute value as a
                                          ;; value constituent because the
                                          ;; slash is specially-handled only
                                          ;; for XHTML, and XHTML attribute
                                          ;; values must always be quoted.  We
                                          ;; could do lookahead for "/>", but
                                          ;; that wouldn't let us parse HTML
                                          ;; "<a href=/>" correctly, so this is
                                          ;; an easier and more correct way to
                                          ;; do things.
                                          (else (write-char c val)
                                                (loop-unquoted-val))))))))
                        (else (unread-c))))
                (if normalized?
                    (list name (if val
                                   (%htmlprag:gosc val)
                                   (symbol->string name)))
                    (if val
                        (list name (%htmlprag:gosc val))
                        (list name))))))

           (scan-comment
            ;; TODO: Rewrite this to use tail recursion rather than a state
            ;; variable.
            (lambda ()
              (let ((os    (open-output-string))
                    (state 'start-minus))
                (let loop ()
                  (read-c)
                  (cond ((c-eof?) #f)
                        ((c-minus?)
                         (set! state
                               (case state
                                 ((start-minus) 'start-minus-minus)
                                 ((start-minus-minus body) 'end-minus)
                                 ((end-minus) 'end-minus-minus)
                                 ((end-minus-minus)
                                  (write-char #\- os)
                                  state)
                                 (else (%htmlprag:error
                                        "make-html-tokenizer"
                                        "invalid state:"
                                        state))))
                         (loop))
                        ((and (c-gt?) (eq? state 'end-minus-minus)) #f)
                        (else (case state
                                ((end-minus)       (write-char #\- os))
                                ((end-minus-minus) (display "--" os)))
                              (set! state 'body)
                              (write-char c os)
                              (loop))))
                (make-comment-token (%htmlprag:gosc os)))))

           (scan-pi
            (lambda ()
              (skip-ws)
              (let ((name (open-output-string))
                    (val  (open-output-string)))
                (let scan-name ()
                  (read-c)
                  (cond ((c-eof?)   #f)
                        ((c-ws?)    #f)
                        ((c-alpha?) (write-char c name) (scan-name))
                        (else       (unread-c))))
                ;; TODO: Do we really want to emit #f for PI name?
                (set! name (output-string->symbol-or-false name))
                (let scan-val ()
                  (read-c)
                  (cond ((c-eof?)  #f)
                        ;; ((c-amp?) (display (scan-entity) val)
                        ;;           (scan-val))
                        ((c-ques?)
                         (read-c)
                         (cond ((c-eof?) (write-char #\? val))
                               ((c-gt?)  #f)
                               (else     (write-char #\? val)
                                         (unread-c)
                                         (scan-val))))
                        (else (write-char c val) (scan-val))))
                (list shtml-pi-symbol
                      name
                      (%htmlprag:gosc val)))))

           (scan-decl
            ;; TODO: Find if SXML includes declaration forms, and if so, use
            ;; whatever format SXML wants.
            ;;
            ;; TODO: Rewrite to eliminate state variables.
            (letrec
                ((scan-parts
                  (lambda ()
                    (let ((part       (open-output-string))
                          (nonsymbol? #f)
                          (state      'before)
                          (last?      #f))
                      (let loop ()
                        (read-c)
                        (cond ((c-eof?) #f)
                              ((c-ws?)
                               (case state
                                 ((before) (loop))
                                 ((quoted) (write-char c part) (loop))))
                              ((and (c-gt?) (not (eq? state 'quoted)))
                               (set! last? #t))
                              ((and (c-lt?) (not (eq? state 'quoted)))
                               (unread-c))
                              ((c-quot?)
                               (case state
                                 ((before)   (set! state 'quoted) (loop))
                                 ((unquoted) (unread-c))
                                 ((quoted)   #f)))
                              (else
                               (if (eq? state 'before)
                                   (set! state 'unquoted))
                               (set! nonsymbol? (or nonsymbol?
                                                    (not (c-alphanum?))))
                               (write-char c part)
                               (loop))))
                      (set! part (%htmlprag:gosc part))
                      (if (string=? part "")
                          '()
                          (cons (if (or (eq? state 'quoted) nonsymbol?)
                                    part
                                    ;; TODO: Normalize case of things we make
                                    ;; into symbols here.
                                    (string->symbol part))
                                (if last?
                                    '()
                                    (scan-parts))))))))
              (lambda () (make-decl-token (scan-parts)))))

           (scan-entity
            (lambda ()
              (read-c)
              (cond ((c-eof?) "&")
                    ((c-alpha?)
                     ;; TODO: Do entity names have a maximum length?
                     (let ((name (open-output-string)))
                       (write-char c name)
                       (let loop ()
                         (read-c)
                         (cond ((c-eof?)   #f)
                               ((c-alpha?) (write-char c name) (loop))
                               ((c-semi?)  #f)
                               (else       (unread-c))))
                       (set! name (%htmlprag:gosc name))
                       ;; TODO: Make the entity map an option.
                       (let ((pair (assoc name '(("amp"  . "&")
                                                 ("apos" . "'")
                                                 ("gt"   . ">")
                                                 ("lt"   . "<")
                                                 ("quot" . "\"")))))
                         (if pair
                             (cdr pair)
                             (make-shtml-entity name)))))
                    ((c-pound?)
                     (let ((num  (open-output-string))
                           (hex? #f))
                       (read-c)
                       (cond ((c-eof?)            #f)
                             ((memv c '(#\x #\X)) (set! hex? #t) (read-c)))
                       (let loop ()
                         (cond ((c-eof?)  #f)
                               ((c-semi?) #f)
                               ((or (c-digit?) (and hex? (c-hexlet?)))
                                (write-char c num)
                                (read-c)
                                (loop))
                               (else (unread-c))))
                       (set! num (%htmlprag:gosc num))
                       (if (string=? num "")
                           "&#;"
                           (let ((n (string->number num (if hex? 16 10))))
                             (if (and (<= 32 n 255) (not (= n 127)))
                                 (string (%htmlprag:a2c n))
                                 (make-shtml-entity n))))))
                    (else (unread-c) "&"))))

           (normal-nexttok
            (lambda ()
              (read-c)
              (cond ((c-eof?) no-token)
                    ((c-lt?)
                     (let loop ()
                       (read-c)
                       (cond ((c-eof?)   "<")
                             ((c-ws?)    (loop))
                             ((c-slash?) (scan-tag #f))
                             ((c-ques?)  (scan-pi))
                             ((c-bang?)  (let loop ()
                                           (read-c)
                                           (cond ((c-eof?)   no-token)
                                                 ((c-ws?)    (loop))
                                                 ((c-minus?) (scan-comment))
                                                 (else       (unread-c)
                                                             (scan-decl)))))
                             ((c-alpha?) (unread-c) (scan-tag #t))
                             (else       (unread-c) "<"))))
                    ((c-gt?) ">")
                    (else (let ((os (open-output-string)))
                            (let loop ()
                              (cond ((c-eof?)   #f)
                                    ((c-angle?) (unread-c))
                                    ((c-amp?)
                                     (let ((entity (scan-entity)))
                                       (if (string? entity)
                                           (begin (display entity os)
                                                  (read-c)
                                                  (loop))
                                           (let ((saved-nexttok nexttok))
                                             (set! nexttok
                                                   (lambda ()
                                                     (set! nexttok
                                                           saved-nexttok)
                                                     entity))))))
                                    (else (write-char c os)
                                          (or (c-lf?)
                                              (begin (read-c) (loop))))))
                            (let ((text (%htmlprag:gosc os)))
                              (if (equal? text "")
                                  (nexttok)
                                  text)))))))

           (verbeof-nexttok
            (lambda ()
              (read-c)
              (if (c-eof?)
                  no-token
                  (let ((os (open-output-string)))
                    (let loop ()
                      (or (c-eof?)
                          (begin (write-char c os)
                                 (or (c-lf?)
                                     (begin (read-c) (loop))))))
                    (%htmlprag:gosc os)))))

           (make-verbpair-nexttok
            (lambda (elem-name)
              (lambda ()
                (let ((os (open-output-string)))
                  ;; Accumulate up to a newline-terminated line.
                  (let loop ()
                    (read-c)
                    (cond ((c-eof?)
                           ;; Got EOF in verbatim context, so set the normal
                           ;; nextok procedure, then fall out of loop.
                           (set! nexttok normal-nexttok))
                          ((c-lt?)
                           ;; Got "<" in verbatim context, so get next
                           ;; character.
                           (read-c)
                           (cond ((c-eof?)
                                  ;; Got "<" then EOF, so set to the normal
                                  ;; nexttok procedure, add the "<" to the
                                  ;; verbatim string, and fall out of loop.
                                  (set! nexttok normal-nexttok)
                                  (write-char #\< os))
                                 ((c-slash?)
                                  ;; Got "</", so...
                                  (read-c)
                                  (cond
                                   ((c-eof?)
                                    (display "</" os))
                                   ((c-alpha?)
                                    ;; Got "</" followed by alpha, so unread
                                    ;; the alpha, scan qname, compare...
                                    (unread-c)
                                    (let* ((vqname (scan-qname #t))
                                           (ns     (car vqname))
                                           (local  (cdr vqname)))
                                      ;; Note: We ignore XML namespace
                                      ;; qualifier for purposes of comparison.
                                      ;;
                                      ;; Note: We're interning strings here for
                                      ;; comparison when in theory there could
                                      ;; be many such unique interned strings
                                      ;; in a valid HTML document, although in
                                      ;; practice this should not be a problem.
                                      (if (and local
                                               (eqv? (string->symbol
                                                      (%htmlprag:down
                                                       local))
                                                     elem-name))
                                          ;; This is the terminator tag, so
                                          ;; scan to the end of it, set the
                                          ;; nexttok, and fall out of the loop.
                                          (begin
                                            (let scan-to-end ()
                                              (read-c)
                                              (cond ((c-eof?) #f)
                                                    ((c-gt?)  #f)
                                                    ((c-lt?)  (unread-c))
                                                    ((c-alpha?)
                                                     (unread-c)
                                                     ;; Note: This is an
                                                     ;; expensive way to skip
                                                     ;; over an attribute, but
                                                     ;; in practice more
                                                     ;; verbatim end tags will
                                                     ;; not have attributes.
                                                     (scan-attr)
                                                     (scan-to-end))
                                                    (else (scan-to-end))))
                                            (set! nexttok
                                                  (lambda ()
                                                    (set! nexttok
                                                          normal-nexttok)
                                                    (make-end-token
                                                     elem-name #f '()))))
                                          ;; This isn't the terminator tag, so
                                          ;; add to the verbatim string the
                                          ;; "</" and the characters of what we
                                          ;; were scanning as a qname, and
                                          ;; recurse in the loop.
                                          (begin
                                            (display "</" os)
                                            (if ns
                                                (begin (display ns os)
                                                       (display ":" os)))
                                            (if local
                                                (display local os))
                                            (loop)))))
                                   (else
                                    ;; Got "</" and non-alpha, so unread new
                                    ;; character, add the "</" to verbatim
                                    ;; string, then loop.
                                    (unread-c)
                                    (display "</" os)
                                    (loop))))
                                 (else
                                  ;; Got "<" and non-slash, so unread the new
                                  ;; character, write the "<" to the verbatim
                                  ;; string, then loop.
                                  (unread-c)
                                  (write-char #\< os)
                                  (loop))))
                          (else
                           ;; Got non-"<" in verbatim context, so just add it
                           ;; to the buffer, then, if it's not a linefeed, fall
                           ;; out of the loop so that the token can be
                           ;; returned.
                           (write-char c os)
                           (or (c-lf?) (loop)))))
                  ;; Return the accumulated line string, if non-null, or call
                  ;; nexttok.
                  (or (output-string->string-or-false os) (nexttok))))))

           (nexttok #f))

        (set! nexttok normal-nexttok)
        (lambda () (nexttok))))))

tokenize-html

Index
(define (tokenize-html in normalized?)
  (let ((next-tok (make-html-tokenizer in normalized?)))
    (let loop ((tok (next-tok)))
      (if (null? tok)
          '()
          (cons tok (loop (next-tok)))))))

shtml-token-kind

Index
(define (shtml-token-kind token)
  (cond ((string? token) shtml-text-symbol)
        ((list?   token)
         (let ((s (list-ref token 0)))
           (if (memq s `(,shtml-comment-symbol
                         ,shtml-decl-symbol
                         ,shtml-empty-symbol
                         ,shtml-end-symbol
                         ,shtml-entity-symbol
                         ,shtml-pi-symbol))
               s
               shtml-start-symbol)))
        (else (%htmlprag:error "shtml-token-kind"
                               "unrecognized token kind:"
                               token))))

%htmlprag:empty-elements

Index
(define %htmlprag:empty-elements
  '(& area base br frame hr img input isindex keygen link meta object param
      spacer wbr))

parse-html/tokenizer

Index
(define parse-html/tokenizer
  ;; TODO: Document the algorithm, then see if rewriting as idiomatic Scheme
  ;; can make it more clear.
  (letrec ((empty-elements
            ;; TODO: Maybe make this an option.  This might also be an
            ;; acceptable way to parse old HTML that uses the `p' element as a
            ;; paragraph terminator.
            %htmlprag:empty-elements)
           (parent-constraints
            ;; TODO: Maybe make this an option.
            '((area     . (map))
              (body     . (html))
              (caption  . (table))
              (colgroup . (table))
              (dd       . (dl))
              (dt       . (dl))
              (frame    . (frameset))
              (head     . (html))
              (isindex  . (head))
              (li       . (dir menu ol ul))
              (meta     . (head))
              (noframes . (frameset))
              (option   . (select))
              (p        . (body td th))
              (param    . (applet))
              (tbody    . (table))
              (td       . (tr))
              (th       . (tr))
              (thead    . (table))
              (title    . (head))
              (tr       . (table tbody thead))))
           (start-tag-name (lambda (tag-token) (car tag-token)))
           (end-tag-name   (lambda (tag-token) (list-ref tag-token 1))))
    (lambda (tokenizer normalized?)
      ;; Example `begs' value:
      ;;
      ;; ( ((head ...) . ( (title ...)                         ))
      ;;   ((html ...) . ( (head  ...) (*COMMENT* ...)         ))
      ;;   (#f         . ( (html  ...) (*DECL*    doctype ...) )) )
      (let ((begs (list (cons #f '()))))
        (letrec ((add-to-current-beg
                  (lambda (tok)
                    (set-cdr! (car begs) (cons tok (cdr (car begs))))))
                 (finish-all-begs
                  (lambda ()
                    (let ((toplist #f))
                      (map (lambda (beg) (set! toplist (finish-beg beg)))
                           begs)
                      toplist)))
                 (finish-beg
                  (lambda (beg)
                    (let ((start-tok (car beg)))
                      (if start-tok
                          (%htmlprag:append!
                           (car beg)
                           (%htmlprag:reverse!ok (cdr beg)))
                          (%htmlprag:reverse!ok (cdr beg))))))
                 (finish-begs-to
                  (lambda (name lst)
                    (let* ((top      (car lst))
                           (starttag (car top)))
                      (cond ((not starttag) #f)
                            ((eqv? name (start-tag-name starttag))
                             (set! begs (cdr lst))
                             (finish-beg top)
                             #t)
                            (else (if (finish-begs-to name (cdr lst))
                                      (begin (finish-beg top) #t)
                                      #f))))))
                 (finish-begs-upto
                  (lambda (parents lst)
                    (let* ((top      (car lst))
                           (starttag (car top)))
                      (cond ((not starttag) #f)
                            ((memq (start-tag-name starttag) parents)
                             (set! begs lst)
                             #t)
                            (else (if (finish-begs-upto parents (cdr lst))
                                      (begin (finish-beg top) #t)
                                      #f)))))))
          (let loop ()
            (let ((tok (tokenizer)))
              (if (null? tok)
                  (finish-all-begs)
                  (let ((kind (shtml-token-kind tok)))
                    (cond ((memv kind `(,shtml-comment-symbol
                                        ,shtml-decl-symbol
                                        ,shtml-entity-symbol
                                        ,shtml-pi-symbol
                                        ,shtml-text-symbol))
                           (add-to-current-beg tok))
                          ((eqv? kind shtml-start-symbol)
                           (let* ((name (start-tag-name tok))
                                  (cell (assq name parent-constraints)))
                             (and cell (finish-begs-upto (cdr cell) begs))
                             (add-to-current-beg tok)
                             (or (memq name empty-elements)
                                 (set! begs (cons (cons tok '()) begs)))))
                          ((eqv? kind shtml-empty-symbol)
                           ;; Empty tag token, so just add it to current
                           ;; beginning while stripping off leading `*EMPTY*'
                           ;; symbol so that the token becomes normal SXML
                           ;; element syntax.
                           (add-to-current-beg (cdr tok)))
                          ((eqv? kind shtml-end-symbol)
                           (let ((name (end-tag-name tok)))
                             (if name
                                 ;; Try to finish to a start tag matching this
                                 ;; end tag.  If none, just drop the token,
                                 ;; though we used to add it to the current
                                 ;; beginning.
                                 (finish-begs-to name begs)
                                 ;; We have an anonymous end tag, so match it
                                 ;; with the most recent beginning.  If no
                                 ;; beginning to match, then just drop the
                                 ;; token, though we used to add it to the
                                 ;; current beginning.
                                 (and (car (car begs))
                                      (begin (finish-beg (car begs))
                                             (set! begs (cdr begs)))))))
                          (else (%htmlprag:error "parse-html/tokenizer"
                                                 "unknown tag kind:"
                                                 kind)))
                    (loop))))))))))

%htmlprag:parse-html

Index
(define (%htmlprag:parse-html input normalized? top?)
  (let ((parse
         (lambda ()
           (parse-html/tokenizer
            (make-html-tokenizer
             (cond ((input-port? input) input)
                   ((string?     input) (open-input-string input))
                   (else (%htmlprag:error
                          "%htmlprag:parse-html"
                          "invalid input type:"
                          input)))
             normalized?)
            normalized?))))
    (if top?
        (cons shtml-top-symbol (parse))
        (parse))))

html->sxml-0nf

Index
(define (html->sxml-0nf input) (%htmlprag:parse-html input #f #t))

html->sxml-1nf

Index
(define (html->sxml-1nf input) (%htmlprag:parse-html input #f #t))

html->sxml-2nf

Index
(define (html->sxml-2nf input) (%htmlprag:parse-html input #t #t))

html->sxml

Index
(define html->sxml  html->sxml-0nf)

html->shtml

Index
(define html->shtml html->sxml-0nf)

%htmlprag:write-shtml-as-html/fixed

Index
(define (%htmlprag:write-shtml-as-html/fixed shtml out foreign-filter)
  (letrec
      ((write-shtml-text
        (lambda (str out)
          (let ((len (string-length str)))
            (let loop ((i 0))
              (if (< i len)
                  (begin (display (let ((c (string-ref str i)))
                                    (case c
                                      ;; ((#\") "&quot;")
                                      ((#\&) "&amp;")
                                      ((#\<) "&lt;")
                                      ((#\>) "&gt;")
                                      (else c)))
                                  out)
                         (loop (+ 1 i))))))))
       (write-dquote-ampified
        (lambda (str out)
          ;; TODO: If we emit "&quot;", we really should parse it, and HTML
          ;; 4.01 says we should, but anachronisms in HTML create the potential
          ;; for nasty mutilation of URI in attribute values.
          (let ((len (string-length str)))
            (let loop ((i 0))
              (if (< i len)
                  (begin (display (let ((c (string-ref str i)))
                                    (if (eqv? c #\") "&quot;" c))
                                  out)
                         (loop (+ 1 i))))))))
       (do-thing
        (lambda (thing)
          (cond ((string? thing) (write-shtml-text thing out))
                ((list? thing)   (if (not (null? thing))
                                     (do-list-thing thing)))
                (else (do-thing (foreign-filter thing #f))))))
       (do-list-thing
        (lambda (thing)
          (let ((head (car thing)))
            (cond ((symbol? head)
                   ;; Head is a symbol, so...
                   (cond ((eq? head shtml-comment-symbol)
                          ;; TODO: Make sure the comment text doesn't contain a
                          ;; comment end sequence.
                          (display "<!-- " out)
                          (let ((text (car (cdr thing))))
                            (if (string? text)
                                ;; TODO: Enforce whitespace safety without
                                ;; padding unnecessarily.
                                ;;
                                ;; (let ((len (string-length text)))
                                ;; (if (= len 0)
                                ;; (display #\space out)
                                ;; (begin (if (not (eqv?
                                ;; (string-ref text 0)
                                ;; #\space))
                                (display text out)
                                (%htmlprag:error
                                 "write-shtml-as-html"
                                 "invalid SHTML comment text:"
                                 thing)))
                          (or (null? (cdr (cdr thing)))
                              (%htmlprag:error
                               "write-shtml-as-html"
                               "invalid SHTML comment body:"
                               thing))
                          (display " -->" out))
                         ((eq? head shtml-decl-symbol)
                          (let ((head (car (cdr thing))))
                            (display "<!" out)
                            (display (symbol->string head) out)
                            (for-each
                             (lambda (n)
                               (cond ((symbol? n)
                                      (display #\space out)
                                      (display (symbol->string n) out))
                                     ((string? n)
                                      (display " \"" out)
                                      (write-dquote-ampified n out)
                                      (display #\" out))
                                     (else (%htmlprag:error
                                            "write-shtml-as-html"
                                            "invalid SHTML decl:"
                                            thing))))
                             (cdr (cdr thing)))
                            (display #\> out)))
                         ((eq? head shtml-pi-symbol)
                          (display "<?" out)
                          (display (symbol->string (car (cdr thing))) out)
                          (display #\space out)
                          (display (car (cdr (cdr thing))) out)
                          ;; TODO: Error-check that no more rest of PI.
                          (display "?>" out))
                         ((eq? head shtml-top-symbol)
                          (for-each do-thing (cdr thing)))
                         ((eq? head shtml-empty-symbol)
                          #f)
                         ((eq? head '@)
                          (%htmlprag:error
                           "write-shtml-as-html"
                           "illegal position of SHTML attributes:"
                           thing))
                         ((or (eq? head '&) (eq? head shtml-entity-symbol))
                          (let ((val (shtml-entity-value thing)))
                            (if val
                                (begin (write-char     #\& out)
                                       (if (integer? val)
                                           (write-char #\# out))
                                       (display        val out)
                                       (write-char     #\; out))
                                (%htmlprag:error
                                 "write-shtml-as-html"
                                 "invalid SHTML entity reference:"
                                 thing))))
                         ((memq head `(,shtml-end-symbol
                                       ,shtml-start-symbol
                                       ,shtml-text-symbol))
                          (%htmlprag:error "write-shtml-as-html"
                                           "invalid SHTML symbol:"
                                           head))
                         (else
                          (display #\< out)
                          (display head out)
                          (let* ((rest   (cdr thing)))
                            (if (not (null? rest))
                                (let ((second (car rest)))
                                  (and (list? second)
                                       (not (null? second))
                                       (eq? (car second)
                                            '@)
                                       (begin (for-each do-attr (cdr second))
                                              (set! rest (cdr rest))))))
                            (if (memq head
                                      %htmlprag:empty-elements)
                                ;; TODO: Error-check to make sure the element
                                ;; has no content other than attributes.  We
                                ;; have to test for cases like: (br (@) ()
                                ;; (()))
                                (display " />" out)
                                (begin (display #\> out)
                                       (for-each do-thing rest)
                                       (display "</" out)
                                       (display (symbol->string head) out)
                                       (display #\> out)))))))
                  ;; ((or (list? head) (string? head))
                  ;;
                  ;; Head is a list or string, which might occur as the result
                  ;; of an SXML transform, so we'll cope.
                  (else
                   ;; Head is not a symbol, which might occur as the result of
                   ;; an SXML transform, so we'll cope.
                   (for-each do-thing thing))
                  ;;(else
                  ;; ;; Head is NOT a symbol, list, or string, so error.
                  ;; (%htmlprag:error "write-shtml-as-html"
                  ;;                          "invalid SHTML list:"
                  ;;                          thing))
                  ))))
       (write-attr-val-dquoted
        (lambda (str out)
          (display #\" out)
          (display str out)
          (display #\" out)))
       (write-attr-val-squoted
        (lambda (str out)
          (display #\' out)
          (display str out)
          (display #\' out)))
       (write-attr-val-dquoted-and-amped
        (lambda (str out)
          (display #\" out)
          (write-dquote-ampified str out)
          (display #\" out)))
       (write-attr-val
        (lambda (str out)
          (let ((len (string-length str)))
            (let find-dquote-and-squote ((i 0))
              (if (= i len)
                  (write-attr-val-dquoted str out)
                  (let ((c (string-ref str i)))
                    (cond ((eqv? c #\")
                           (let find-squote ((i (+ 1 i)))
                             (if (= i len)
                                 (write-attr-val-squoted str out)
                                 (if (eqv? (string-ref str i) #\')
                                     (write-attr-val-dquoted-and-amped str
                                                                       out)
                                     (find-squote (+ 1 i))))))
                          ((eqv? c #\')
                           (let find-dquote ((i (+ 1 i)))
                             (if (= i len)
                                 (write-attr-val-dquoted str out)
                                 (if (eqv? (string-ref str i) #\")
                                     (write-attr-val-dquoted-and-amped str
                                                                       out)
                                     (find-dquote (+ 1 i))))))
                          (else (find-dquote-and-squote (+ 1 i))))))))))

       (collect-and-write-attr-val
        ;; TODO: Take another look at this.
        (lambda (lst out)
          (let ((os #f))
            (let do-list ((lst lst))
              (for-each
               (lambda (thing)
                 (let do-thing ((thing thing))
                   (cond ((string? thing)
                          (or os (set! os (open-output-string)))
                          (display thing os))
                         ((list? thing)
                          (do-list thing))
                         ((eq? thing #t)
                          #f)
                         (else
                          (do-thing (foreign-filter thing #t))))))
               lst))
            (if os
                (begin
                  (display #\= out)
                  (write-attr-val (%htmlprag:gosc os) out))))))

       (do-attr
        (lambda (attr)
          (or (list? attr)
              (%htmlprag:error "write-shtml-as-html"
                               "invalid SHTML attribute:"
                               attr))
          (if (not (null? attr))
              (let ((name (car attr)))
                (or (symbol? name)
                    (%htmlprag:error
                     "write-shtml-as-html"
                     "invalid name in SHTML attribute:"
                     attr))
                (if (not (eq? name '@))
                    (begin
                      (display #\space out)
                      (display name    out)
                      (collect-and-write-attr-val (cdr attr) out)

                      )))))))
    (do-thing shtml)
    (if #f #f)))

write-shtml-as-html

Index
(define write-shtml-as-html
  (letrec ((error-foreign-filter
            (lambda (foreign-object in-attribute-value?)
              (%htmlprag:error
               "write-shtml-as-html"
               (if in-attribute-value?
                   "unhandled foreign object in shtml attribute value:"
                   "unhandled foreign object in shtml:")
               foreign-object))))
    (lambda (shtml . rest)
      (case (length rest)
        ((0) (%htmlprag:write-shtml-as-html/fixed
              shtml
              (current-output-port)
              error-foreign-filter))
        ((1) (%htmlprag:write-shtml-as-html/fixed
              shtml
              (car rest)
              error-foreign-filter))
        ((2) (%htmlprag:write-shtml-as-html/fixed
              shtml
              (car rest)
              (cadr rest)))
        (else
         (%htmlprag:error "write-shtml-as-html"
                          "extraneous arguments:"
                          (cddr rest)))))))

shtml->html

Index
(define (shtml->html shtml)
  (let ((os (open-output-string)))
    (write-shtml-as-html shtml os)
    (%htmlprag:gosc os)))

sxml->html

Index
(define sxml->html      shtml->html)

write-sxml-html

Index
(define write-sxml-html write-shtml-as-html)

%htmlprag:test

Index
(define (%htmlprag:test)
  (%htmlprag:testeez
   "HtmlPrag"

   (test-define "" lf (string (%htmlprag:a2c 10)))

   (test/equal "" (html->shtml "<a>>") `(,shtml-top-symbol (a ">")))
   (test/equal "" (html->shtml "<a<>") `(,shtml-top-symbol (a "<" ">")))

   (test/equal "" (html->shtml "<>")      `(,shtml-top-symbol "<" ">"))
   (test/equal "" (html->shtml "< >")     `(,shtml-top-symbol "<" ">"))
   (test/equal "" (html->shtml "< a>")    `(,shtml-top-symbol (a)))
   (test/equal "" (html->shtml "< a / >") `(,shtml-top-symbol (a)))

   (test/equal "" (html->shtml "<a<")  `(,shtml-top-symbol (a "<")))
   (test/equal "" (html->shtml "<a<b") `(,shtml-top-symbol (a (b))))

   (test/equal "" (html->shtml "><a>") `(,shtml-top-symbol ">" (a)))

   (test/equal "" (html->shtml "</>") `(,shtml-top-symbol))

   (test/equal "" (html->shtml "<\">") `(,shtml-top-symbol "<" "\"" ">"))

   (test/equal ""
               (html->shtml (string-append "<a>xxx<plaintext>aaa" lf
                                           "bbb" lf
                                           "c<c<c"))
               `(,shtml-top-symbol
                 (a "xxx" (plaintext ,(string-append "aaa" lf)
                                     ,(string-append "bbb" lf)
                                     "c<c<c"))))

   (test/equal ""
               (html->shtml "aaa<!-- xxx -->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol " xxx ")   "bbb"))

   (test/equal ""
               (html->shtml "aaa<! -- xxx -->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol " xxx ")   "bbb"))

   (test/equal ""
               (html->shtml "aaa<!-- xxx --->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol " xxx -")  "bbb"))

   (test/equal ""
               (html->shtml "aaa<!-- xxx ---->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol " xxx --") "bbb"))

   (test/equal ""
               (html->shtml "aaa<!-- xxx -y-->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol " xxx -y") "bbb"))

   (test/equal ""
               (html->shtml "aaa<!----->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol "-")       "bbb"))

   (test/equal ""
               (html->shtml "aaa<!---->bbb")
               `(,shtml-top-symbol
                 "aaa" (,shtml-comment-symbol "")        "bbb"))

   (test/equal ""
               (html->shtml "aaa<!--->bbb")
               `(,shtml-top-symbol "aaa" (,shtml-comment-symbol "->bbb")))

   (test/equal "" (html->shtml "<hr>")   `(,shtml-top-symbol (hr)))
   (test/equal "" (html->shtml "<hr/>")  `(,shtml-top-symbol (hr)))
   (test/equal "" (html->shtml "<hr />") `(,shtml-top-symbol (hr)))

   (test/equal ""
               (html->shtml "<hr noshade>")
               `(,shtml-top-symbol (hr (@ (noshade)))))
   (test/equal ""
               (html->shtml "<hr noshade/>")
               `(,shtml-top-symbol (hr (@ (noshade)))))
   (test/equal ""
               (html->shtml "<hr noshade />")
               `(,shtml-top-symbol (hr (@ (noshade)))))
   (test/equal ""
               (html->shtml "<hr noshade / >")
               `(,shtml-top-symbol (hr (@ (noshade)))))
   (test/equal ""
               (html->shtml "<hr noshade=1 />")
               `(,shtml-top-symbol (hr (@ (noshade "1")))))
   (test/equal ""
               (html->shtml "<hr noshade=1/>")
               `(,shtml-top-symbol (hr (@ (noshade "1/")))))

   (test/equal ""
               (html->shtml "<q>aaa<p/>bbb</q>ccc</p>ddd")
               `(,shtml-top-symbol (q "aaa" (p) "bbb") "ccc" "ddd"))

   (test/equal "" (html->shtml "&lt;") `(,shtml-top-symbol "<"))
   (test/equal "" (html->shtml "&gt;") `(,shtml-top-symbol ">"))

   (test/equal ""
               (html->shtml "Gilbert &amp; Sullivan")
               `(,shtml-top-symbol "Gilbert & Sullivan"))
   (test/equal ""
               (html->shtml "Gilbert &amp Sullivan")
               `(,shtml-top-symbol "Gilbert & Sullivan"))
   (test/equal ""
               (html->shtml "Gilbert & Sullivan")
               `(,shtml-top-symbol "Gilbert & Sullivan"))

   (test/equal ""
               (html->shtml "Copyright &copy; Foo")
               `(,shtml-top-symbol "Copyright "
                                   (& ,(string->symbol "copy"))
                                   " Foo"))
   (test/equal ""
               (html->shtml "aaa&copy;bbb")
               `(,shtml-top-symbol
                 "aaa" (& ,(string->symbol "copy")) "bbb"))
   (test/equal ""
               (html->shtml "aaa&copy")
               `(,shtml-top-symbol
                 "aaa" (& ,(string->symbol "copy"))))

   (test/equal "" (html->shtml "&#42;")  `(,shtml-top-symbol "*"))
   (test/equal "" (html->shtml "&#42")   `(,shtml-top-symbol "*"))
   (test/equal "" (html->shtml "&#42x")  `(,shtml-top-symbol "*x"))
   (test/equal "" (html->shtml "&#151")  `(,shtml-top-symbol
                                           ,(string (%htmlprag:a2c 151))))
   (test/equal "" (html->shtml "&#1000") `(,shtml-top-symbol (& 1000)))
   (test/equal "" (html->shtml "&#x42")  `(,shtml-top-symbol "B"))
   (test/equal "" (html->shtml "&#xA2")  `(,shtml-top-symbol
                                           ,(string (%htmlprag:a2c 162))))
   (test/equal "" (html->shtml "&#xFF")  `(,shtml-top-symbol
                                           ,(string (%htmlprag:a2c 255))))
   (test/equal "" (html->shtml "&#x100") `(,shtml-top-symbol (& 256)))
   (test/equal "" (html->shtml "&#X42")  `(,shtml-top-symbol "B"))
   (test/equal "" (html->shtml "&42;")   `(,shtml-top-symbol "&42;"))

   (test/equal ""
               (html->shtml (string-append "aaa&copy;bbb&amp;ccc&lt;ddd&&gt;"
                                           "eee&#42;fff&#1000;ggg&#x5a;hhh"))
               `(,shtml-top-symbol
                 "aaa"
                 (& ,(string->symbol "copy"))
                 "bbb&ccc<ddd&>eee*fff"
                 (& 1000)
                 "gggZhhh"))

   (test/equal ""
               (html->shtml
                (string-append
                 "<IMG src=\"http://e.e/aw/pics/listings/"
                 "ebayLogo_38x16.gif\" border=0 width=\"38\" height=\"16\" "
                 "HSPACE=5 VSPACE=0\">2</FONT>"))
               `(,shtml-top-symbol
                 (img (@
                       (src
                        "http://e.e/aw/pics/listings/ebayLogo_38x16.gif")
                       (border "0") (width "38") (height "16")
                       (hspace "5") (vspace "0")))
                 "2"))

   (test/equal ""
               (html->shtml "<aaa bbb=ccc\"ddd>eee")
               `(,shtml-top-symbol (aaa (@ (bbb "ccc") (ddd)) "eee")))
   (test/equal ""
               (html->shtml "<aaa bbb=ccc \"ddd>eee")
               `(,shtml-top-symbol (aaa (@ (bbb "ccc") (ddd)) "eee")))

   (test/equal ""
               (html->shtml
                (string-append
                 "<HTML><Head><Title>My Title</Title></Head>"
                 "<Body BGColor=\"white\" Foo=42>"
                 "This is a <B><I>bold-italic</B></I> test of </Erk>"
                 "broken HTML.<br>Yes it is.</Body></HTML>"))
               `(,shtml-top-symbol
                 (html (head (title "My Title"))
                       (body (@ (bgcolor "white") (foo "42"))
                             "This is a "
                             (b (i "bold-italic"))
                             " test of "
                             "broken HTML."
                             (br)
                             "Yes it is."))))

   (test/equal ""
               (html->shtml
                (string-append
                 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
                 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
               `(,shtml-top-symbol
                 (,shtml-decl-symbol
                  ,(string->symbol "DOCTYPE")
                  html
                  ,(string->symbol "PUBLIC")
                  "-//W3C//DTD XHTML 1.0 Strict//EN"
                  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")))

   (test/equal ""
               (html->shtml
                (string-append
                 "<html xmlns=\"http://www.w3.org/1999/xhtml\" "
                 "xml:lang=\"en\" "
                 "lang=\"en\">"))
               `(,shtml-top-symbol
                 (html (@ (xmlns "http://www.w3.org/1999/xhtml")
                          (xml:lang "en") (lang "en")))))

   (test/equal
    ""
    (html->shtml
     (string-append
      "<html:html xmlns:html=\"http://www.w3.org/TR/REC-html40\">"
      "<html:head><html:title>Frobnostication</html:title></html:head>"
      "<html:body><html:p>Moved to <html:a href=\"http://frob.com\">"
      "here.</html:a></html:p></html:body></html:html>"))
    `(,shtml-top-symbol
      (html (@ (xmlns:html "http://www.w3.org/TR/REC-html40"))
            (head (title "Frobnostication"))
            (body (p "Moved to "
                     (a (@ (href "http://frob.com"))
                        "here."))))))

   (test/equal ""
               (html->shtml
                (string-append
                 "<RESERVATION xmlns:HTML=\"http://www.w3.org/TR/REC-html40\">"
                 "<NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
                 "<SEAT CLASS=\"Y\" HTML:CLASS=\"largeMonotype\">33B</SEAT>"
                 "<HTML:A HREF=\"/cgi-bin/ResStatus\">Check Status</HTML:A>"
                 "<DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>"))
               `(,shtml-top-symbol
                 (reservation (@ (,(string->symbol "xmlns:HTML")
                                  "http://www.w3.org/TR/REC-html40"))
                              (name (@ (class "largeSansSerif"))
                                    "Layman, A")
                              (seat (@ (class "Y") (class "largeMonotype"))
                                    "33B")
                              (a (@ (href "/cgi-bin/ResStatus"))
                                 "Check Status")
                              (departure "1997-05-24T07:55:00+1"))))

   (test/equal
    ""
    (html->shtml
     (string-append
      "<html><head><title></title><title>whatever</title></head><body>"
      "<a href=\"url\">link</a><p align=center><ul compact style=\"aa\">"
      "<p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened </i>"
      " still &lt; bold </b></body><P> But not done yet..."))
    `(,shtml-top-symbol
      (html (head (title) (title "whatever"))
            (body (a (@ (href "url")) "link")
                  (p (@ (align "center"))
                     (ul (@ (compact) (style "aa"))))
                  (p "BLah"
                     (,shtml-comment-symbol " comment <comment> ")
                     " "
                     (i " italic " (b " bold " (tt " ened ")))
                     " still < bold "))
            (p " But not done yet..."))))

   (test/equal ""
               (html->shtml "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
               `(,shtml-top-symbol
                 (,shtml-pi-symbol xml "version=\"1.0\" encoding=\"UTF-8\"")))

   (test/equal ""
               (html->shtml "<?php php_info(); ?>")
               `(,shtml-top-symbol (,shtml-pi-symbol php "php_info(); ")))
   (test/equal ""
               (html->shtml "<?php php_info(); ?")
               `(,shtml-top-symbol (,shtml-pi-symbol php "php_info(); ?")))
   (test/equal ""
               (html->shtml "<?php php_info(); ")
               `(,shtml-top-symbol (,shtml-pi-symbol php "php_info(); ")))

   (test/equal ""
               (html->shtml "<?foo bar ? baz > blort ?>")
               `(,shtml-top-symbol
                 (,shtml-pi-symbol foo "bar ? baz > blort ")))

   (test/equal ""
               (html->shtml "<?foo b?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol foo "b") "x"))
   (test/equal ""
               (html->shtml "<?foo ?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol foo "")  "x"))
   (test/equal ""
               (html->shtml "<?foo ?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol foo "")  "x"))
   (test/equal ""
               (html->shtml "<?foo?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol foo "")  "x"))
   (test/equal ""
               (html->shtml "<?f?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol f   "")  "x"))
   (test/equal ""
               (html->shtml "<??>x")
               `(,shtml-top-symbol (,shtml-pi-symbol #f  "")  "x"))
   (test/equal ""
               (html->shtml "<?>x")
               `(,shtml-top-symbol (,shtml-pi-symbol #f  ">x")))

   (test/equal ""
               (html->shtml "<foo bar=\"baz\">blort")
               `(,shtml-top-symbol (foo (@ (bar "baz")) "blort")))
   (test/equal ""
               (html->shtml "<foo bar='baz'>blort")
               `(,shtml-top-symbol (foo (@ (bar "baz")) "blort")))
   (test/equal ""
               (html->shtml "<foo bar=\"baz'>blort")
               `(,shtml-top-symbol (foo (@ (bar "baz'>blort")))))
   (test/equal ""
               (html->shtml "<foo bar='baz\">blort")
               `(,shtml-top-symbol (foo (@ (bar "baz\">blort")))))

   (test/equal ""
               (html->shtml (string-append "<p>A</p>"
                                           "<script>line0 <" lf
                                           "line1" lf
                                           "<line2></script>"
                                           "<p>B</p>"))
               `(,shtml-top-symbol (p "A")
                                   (script ,(string-append "line0 <" lf)
                                           ,(string-append "line1"   lf)
                                           "<line2>")
                                   (p "B")))

   (test/equal ""
               (html->shtml "<xmp>a<b>c</XMP>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))
   (test/equal ""
               (html->shtml "<XMP>a<b>c</xmp>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))
   (test/equal ""
               (html->shtml "<xmp>a<b>c</foo:xmp>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))
   (test/equal ""
               (html->shtml "<foo:xmp>a<b>c</xmp>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))
   (test/equal ""
               (html->shtml "<foo:xmp>a<b>c</foo:xmp>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))
   (test/equal ""
               (html->shtml "<foo:xmp>a<b>c</bar:xmp>d")
               `(,shtml-top-symbol (xmp "a<b>c") "d"))

   (test/equal ""
               (html->shtml "<xmp>a</b>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b>c")     "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b >c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b >c")    "d"))
   (test/equal ""
               (html->shtml "<xmp>a</ b>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</ b>c")    "d"))
   (test/equal ""
               (html->shtml "<xmp>a</ b >c</xmp>d")
               `(,shtml-top-symbol (xmp "a</ b >c")   "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b:x>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b:x>c")   "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b::x>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b::x>c")  "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b:::x>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b:::x>c") "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b:>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b:>c")    "d"))
   (test/equal ""
               (html->shtml "<xmp>a</b::>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</b::>c")   "d"))
   (test/equal ""
               (html->shtml "<xmp>a</xmp:b>c</xmp>d")
               `(,shtml-top-symbol (xmp "a</xmp:b>c") "d"))

   (test-define "expected output for next two tests"
                expected
                `(,shtml-top-symbol (p "real1")
                                    ,lf
                                    (xmp ,lf
                                         ,(string-append "alpha"       lf)
                                         ,(string-append "<P>fake</P>" lf)
                                         ,(string-append "bravo"       lf))
                                    (p "real2")))

   (test/equal ""
               (html->shtml (string-append "<P>real1</P>" lf
                                           "<XMP>"        lf
                                           "alpha"        lf
                                           "<P>fake</P>"  lf
                                           "bravo"        lf
                                           "</XMP "       lf
                                           "<P>real2</P>"))
               expected)

   (test/equal ""
               (html->shtml (string-append "<P>real1</P>" lf
                                           "<XMP>"        lf
                                           "alpha"        lf
                                           "<P>fake</P>"  lf
                                           "bravo"        lf
                                           "</XMP"        lf
                                           "<P>real2</P>"))
               expected)

   (test/equal ""
               (html->shtml "<xmp>a</xmp>x")
               `(,shtml-top-symbol (xmp "a")   "x"))
   (test/equal ""
               (html->shtml (string-append "<xmp>a" lf "</xmp>x"))
               `(,shtml-top-symbol (xmp ,(string-append "a" lf)) "x"))
   (test/equal ""
               (html->shtml "<xmp></xmp>x")
               `(,shtml-top-symbol (xmp)       "x"))

   (test/equal ""
               (html->shtml "<xmp>a</xmp") `(,shtml-top-symbol (xmp "a")))
   (test/equal ""
               (html->shtml "<xmp>a</xm")  `(,shtml-top-symbol (xmp "a</xm")))
   (test/equal ""
               (html->shtml "<xmp>a</x")   `(,shtml-top-symbol (xmp "a</x")))
   (test/equal ""
               (html->shtml "<xmp>a</")    `(,shtml-top-symbol (xmp "a</")))
   (test/equal ""
               (html->shtml "<xmp>a<")     `(,shtml-top-symbol (xmp "a<")))
   (test/equal ""
               (html->shtml "<xmp>a")      `(,shtml-top-symbol (xmp "a")))
   (test/equal ""
               (html->shtml "<xmp>")       `(,shtml-top-symbol (xmp)))
   (test/equal ""
               (html->shtml "<xmp")        `(,shtml-top-symbol (xmp)))

   (test/equal ""
               (html->shtml "<xmp x=42 ")
               `(,shtml-top-symbol (xmp (@ (x "42")))))
   (test/equal ""
               (html->shtml "<xmp x= ")   `(,shtml-top-symbol (xmp (@ (x)))))
   (test/equal ""
               (html->shtml "<xmp x ")    `(,shtml-top-symbol (xmp (@ (x)))))
   (test/equal ""
               (html->shtml "<xmp x")     `(,shtml-top-symbol (xmp (@ (x)))))

   (test/equal ""
               (html->shtml "<script>xxx")
               `(,shtml-top-symbol (script "xxx")))
   (test/equal ""
               (html->shtml "<script/>xxx")
               `(,shtml-top-symbol (script) "xxx"))

   (test/equal ""
               (html->shtml "<html xml:lang=\"en\" lang=\"en\">")
               `(,shtml-top-symbol (html (@ (xml:lang "en") (lang "en")))))

   (test/equal ""
               (html->shtml "<a href=/foo.html>")
               `(,shtml-top-symbol (a (@ (href "/foo.html")))))
   (test/equal ""
               (html->shtml "<a href=/>foo.html")
               `(,shtml-top-symbol (a (@ (href "/")) "foo.html")))

   ;; TODO: Add verbatim-pair cases with attributes in the end tag.

   (test/equal ""
               (shtml->html '(p))            "<p></p>")
   (test/equal ""
               (shtml->html '(p "CONTENT"))  "<p>CONTENT</p>")
   (test/equal ""
               (shtml->html '(br))           "<br />")
   (test/equal ""
               (shtml->html '(br "CONTENT")) "<br />")

   (test/equal ""
               (shtml->html `(hr (@ (clear "all"))))
               "<hr clear=\"all\" />")

   (test/equal ""
               (shtml->html `(hr (@ (noshade))))
               "<hr noshade />")
   (test/equal ""
               (shtml->html `(hr (@ (noshade #t))))
               "<hr noshade />") ;; TODO: Maybe lose this test.
   (test/equal ""
               (shtml->html `(hr (@ (noshade "noshade"))))
               "<hr noshade=\"noshade\" />")

   (test/equal ""
               (shtml->html `(hr (@ (aaa "bbbccc"))))
               "<hr aaa=\"bbbccc\" />")
   (test/equal ""
               (shtml->html `(hr (@ (aaa "bbb'ccc"))))
               "<hr aaa=\"bbb'ccc\" />")
   (test/equal ""
               (shtml->html `(hr (@ (aaa "bbb\"ccc"))))
               "<hr aaa='bbb\"ccc' />")
   (test/equal ""
               (shtml->html `(hr (@ (aaa "bbb\"ccc'ddd"))))
               "<hr aaa=\"bbb&quot;ccc'ddd\" />")

   (test/equal "" (shtml->html '(& "copy"))                   "&copy;")
   (test/equal "" (shtml->html '(& "rArr"))                   "&rArr;")
   (test/equal "" (shtml->html `(& ,(string->symbol "rArr"))) "&rArr;")
   (test/equal "" (shtml->html '(& 151))                      "&#151;")

   (test/equal ""
               (html->shtml "&copy;")
               `(,shtml-top-symbol (& ,(string->symbol "copy"))))
   (test/equal ""
               (html->shtml "&rArr;")
               `(,shtml-top-symbol (& ,(string->symbol "rArr"))))
   (test/equal ""
               (html->shtml "&#151;")
               `(,shtml-top-symbol ,(string (%htmlprag:a2c 151))))

   (test/equal ""
               (html->shtml "&#999;")
               `(,shtml-top-symbol (& 999)))

   (test/equal ""
               (shtml->html
                `(,shtml-pi-symbol xml "version=\"1.0\" encoding=\"UTF-8\""))
               "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")

   (test/equal ""
               (shtml->html
                `(,shtml-decl-symbol
                  ,(string->symbol "DOCTYPE")
                  html
                  ,(string->symbol "PUBLIC")
                  "-//W3C//DTD XHTML 1.0 Strict//EN"
                  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"))
               (string-append
                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
                " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))

   (test/equal ""
               (shtml-entity-value '(*ENTITY* "shtml-named-char" "rArr"))
               (string->symbol "rArr"))

   (test/equal ""
               (shtml-entity-value '(& "rArr"))
               (string->symbol "rArr"))

   (test/equal ""
               (shtml-entity-value `(& ,(string->symbol "rArr")))
               (string->symbol "rArr"))

   ;; TODO: Write more test cases for HTML encoding.

   ;; TODO: Write test cases for foreign-filter of HTML writing.

   ;; TODO: Write test cases for attribute values that aren't simple strings.

   ;; TODO: Document this.
   ;;
   ;; (define html-1 "<myelem myattr=\"&\">")
   ;; (define shtml   (html->shtml html-1))
   ;; shtml
   ;; (define html-2 (shtml->html shtml))
   ;; html-2

   ))