1
0
Fork 0
tekuti/tekuti/marxdown.scm
2022-10-20 22:57:13 +02:00

631 lines
22 KiB
Scheme

;; Tekuti
;; Copyright (C) 2022 Andy Wingo <wingo at pobox dot com>
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation; either version 3 of
;; the License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; if not, contact:
;;
;; Free Software Foundation Voice: +1-617-542-5942
;; 59 Temple Place - Suite 330 Fax: +1-617-542-2652
;; Boston, MA 02111-1307, USA gnu@gnu.org
;;; Commentary:
;;
;; "Marxdown" to SXML parser.
;;
;; Marxdown is a dialect of Markdown, designed for simplicity and
;; embeddability. The X in markdown refers to how it processes embedded
;; XML.
;;
;; In some ways, Marxdown is a more restrictive subset of Markdown:
;;
;; - Marxdown only supports headings with the ### prefix ("ATX
;; headings"), and doesn't support so-called "Setext headings" that
;; follow the heading with --- or similar. Why bother?
;;
;; - Marxdown doesn't support indented code blocks, instead only using
;; fenced code blocks.
;;
;; - Markxdown doesn't currently support link titles or reference
;; links.
;;
;; - Marxdown requires each line of a block quote to have the same
;; indent and `>` characters, not just the first line.
;;
;; - Marxdown doesn't support multi-` inline code sequences; just the
;; single `.
;;
;; - ...
;;
;; Generally speaking, a valid Marxdown document will also be valid
;; Markdown, as defined by CommonMark. However there are exceptions.
;; Marxdown is less "sloppy", if you will; unlike Markdown, not any
;; sequence of characters is valid Marxdown. Parsing Marxdown can raise
;; an error.
;;
;; Notably, Marxdown requires valid nesting. For example, `*_foo*_` is
;; an error in Marxdown, as the emphasis markers aren't nested properly.
;;
;; Also, when Marxdown parses embedded XML, it hands over control of the
;; character stream to a validating XML parser which can also throw an
;; error. This XML parser doesn't stop at what CommonMark Markdown
;; would consider block boundaries. As a consequence, Marxdown cannot
;; be embedded inside XML. Due to a limitation which may be relaxed in
;; the future, the XML parser doesn't strip off any blockquote prefix.
;;
;; These limitations come with a benefit: tools can easily consume
;; Marxdown and then embed that result in SXML documents of different
;; kinds.
;;
;;; Code:
(define-module (tekuti marxdown)
#:use-module (ice-9 match)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 rdelim)
#:use-module (sxml ssax)
#:use-module (sxml transform)
#:use-module ((srfi srfi-1) #:select (fold))
#:export (marxdown->smarxdown
smarxdown->shtml))
(define (parse-one-xml-element port)
;; -> seed
(define (fdown elem-gi attributes namespaces expected-content seed)
'())
;; -> seed
(define (fup elem-gi attributes namespaces parent-seed seed)
(cons `(,elem-gi
,@(match namespaces
(() '())
;; fixme: prohibit?
(_ `((*NAMESPACES* ,namespaces))))
,@(match attributes
(() '())
(((attr . value) ...)
`((@ ,(map list attr value)))))
. ,(reverse seed))
parent-seed))
(define (ftext string1 string2 seed)
(if (string-null? string2)
(cons string1 seed)
(cons* string2 string1 seed)))
(define parse-element (ssax:make-elem-parser fdown fup ftext ()))
(let ((token (ssax:read-markup-token port)))
(match (xml-token-kind token)
('COMMENT #f)
('START
(let ((elems #f) (entities '()) (namespaces '()) (seed '()))
(match (parse-element (xml-token-head token) port elems
entities namespaces #t seed)
((elt) elt))))
(kind (error "unexpected XML token" token)))))
(define (marxdown->smarxdown port)
(define (round-up x y)
(* y (ceiling-quotient x y)))
(define (advance/tab indent)
(round-up (1+ indent) 4))
(define (advance indent)
(1+ indent))
(define (unget1 ch)
(unget-char port ch))
(define (unget chars)
(match chars
(()
(values))
((ch . chars)
(unget1 ch)
(unget chars))))
(define (next) (get-char port))
(define (next-not-eof ctx)
(let ((ch (next)))
(if (eof-object? ch)
(error "EOF while reading" ctx)
ch)))
(define (next-line-and-delim) (read-line port 'split))
(define (peek) (peek-char port))
(define (skip-whitespace k)
(let lp ((indent 0))
(let ((ch (next)))
(case ch
((#\space)
(lp (advance indent)))
((#\tab)
(lp (advance/tab indent)))
(else
(k ch indent))))))
(define empty-indent '(0))
(define (read-indent k)
(skip-whitespace
(lambda (ch indent)
(match ch
(#\>
(read-indent
(lambda (ch indent*)
(k ch
(cons indent
(match indent*
((head . tail)
;; Account for the #\>.
(cons (1+ head) tail))))))))
(_
(k ch (list indent)))))))
(define (advance-indent indent)
(match indent
((indent* ... indent)
(append indent* (list (advance indent))))))
(define (advance-indent/tab indent)
(let* ((col (apply + indent))
(col* (advance/tab col)))
(match indent
((indent* ... indent)
(append indent* (+ (- col* col) indent))))))
(define (compare-indents prev new)
(match (vector prev new)
(#((x . prev-tail) (x . new-tail))
(compare-indents prev-tail new-tail))
(_
(match (vector prev new)
(#(() ()) 'same)
(#((prev) (new)) (if (< prev new) 'inner 'outer))
(#(() _) 'inner-blockquote)
(#(_ ()) 'outer-blockquote)
(_ 'different-blockquote)))))
;; indent as list of blockquote
;; kup ::= (node type info indent) -> _
;; knext ::= nodelist -> node
(define (drop-whitespace-up-to n col kt kf)
(define col-end (+ col n))
(let lp ((n n))
(cond
((zero? n) (kt))
(else
(match (next)
(#\space (lp (1- n)))
(#\tab
(let ((col (advance/tab (- col-end n))))
(cond
((<= col col-end) (lp (- col-end col)))
(else (kt)))))
(#\newline
;; Sure. Trailing whitespace can be any indent.
(kt))
(ch
(unless (eof-object? ch) (unget1 ch))
(kf)))))))
(define (drop-whitespace-then-blockquote n col kt kf)
(define col-end (+ col n))
(let lp ((n n) (kf kf))
(let* ((ch (next))
(kf (lambda () (unless (eof-object? ch) (unget1 ch)) (kf))))
(cond
((zero? n)
(match ch
(#\> (kt kf))
(_ (kf))))
(else
(match ch
(#\space (lp (1- n) kf))
(#\tab
(let ((col (advance/tab (- col-end n))))
(cond
((<= col col-end) (lp (- col-end col) kf))
(else (kf)))))
(_ (kf))))))))
(define (consume-indent indent kt kf)
(match indent
((0) (kt))
(_
(let lp ((indent indent) (col 0) (kf kf))
(match indent
((n) (drop-whitespace-up-to n col kt kf))
((n m . indent)
(let* ((indent (cons (1- m) indent))
(kt (lambda (kf) (lp indent (+ col n 1) kf))))
(drop-whitespace-then-blockquote n col kt kf))))))))
(define (read-pre tag indent k)
(let lp ((body '()))
(define (finish tail)
(k (list 'pre tag (string-concatenate-reverse body tail))))
(consume-indent
indent
(lambda ()
(match (next-line-and-delim)
((str . delim)
(cond
((eof-object? delim)
(finish (if (eof-object? str) "" str)))
((string=? str "```")
(finish ""))
(else
(lp (cons* (string delim) str body)))))))
(lambda ()
(finish "")))))
(define (read-link-destination)
(match (next-not-eof "link destination")
(#\<
(let lp ((chars '()))
(match (next-not-eof "<>-delimited link")
(#\> (reverse-list->string chars))
(#\< (error "< inside <>-delimited link"))
(#\\
(match (next)
((? eof-object?) (lp (cons #\\ chars)))
(ch (lp (cons ch chars)))))
(ch (lp (cons ch chars))))))
(ch
(unget1 ch)
(let lp ((chars '()) (k reverse-list->string))
(match (next-not-eof "link destination")
(#\) (unget1 #\)) (k chars))
((and ch
(or #\space #\)
(? (lambda (ch)
(or (char<? ch (integer->char #x20))
(char=? ch (integer->char #x7f)))))))
(unget1 ch)
(k chars))
(#\(
(lp (cons #\( chars)
(lambda (chars)
(match (next-not-eof "link destination")
(#\) (lp (cons #\) chars) k))
(ch (error "unexpected char" ch))))))
(#\\
(lp (cons (next-not-eof "link-destination") chars) k))
(ch (lp (cons ch chars) k)))))))
(define (read-link indent continue)
(read-text
#f indent
(lambda (ch)
(match ch
(#\]
(lambda (text)
(match (next)
(#\(
(let ((dest (read-link-destination)))
(match (next)
(#\)
(continue `(link ,dest . ,text)))
(ch
(error "unexpected after link dest" ch)))))
(ch (error "link missing destination URL")))))
(_ #f)))
(lambda (elts)
(error "end-of-block while reading link"))))
(define (read-emph indent delim continue)
(define (delim? ch) (eqv? ch delim))
(match (next-not-eof "emphasis")
((? delim?)
(let ((done? (lambda (ch)
(match ch
((? delim?)
(match (next-not-eof "emphasis")
((? delim?) continue)
(ch (unget1 ch) #f)))
(_ #f)))))
(read-text 'strong indent done?
(lambda (elt)
(error "end of block while reading strong" elt)))))
(ch
(unget1 ch)
(let ((done? (lambda (ch) (and (delim? ch) continue))))
(read-text 'emph indent done?
(lambda (elt)
(error "end of block while reading emph" elt)))))))
(define (read-code indent continue)
(let lp ((chars '()))
(match (next-not-eof "backticks")
(#\` (continue `(code ,(reverse-list->string chars))))
(#\newline
(consume-indent
indent
(lambda ()
(match (next-not-eof "code")
(#\newline (error "end of block while reading code"))
(ch (unget1 ch) (lp (cons #\space chars)))))
(lambda () (error "end of block while reading code"))))
(ch (lp (cons ch chars))))))
(define (read-text tag indent done? on-block-end)
(let lp ((elts '()))
(define (continue elt) (lp (cons elt elts)))
(define (finish kdone)
(let lp ((elts elts) (out '()))
(match elts
(() (kdone (if tag (cons tag out) out)))
(((? char? ch) . elts)
(let lp2 ((elts elts) (chars (list ch)))
(match elts
(((? char? ch) . elts)
(lp2 elts (cons ch chars)))
(_
(lp elts (cons (list->string chars) out))))))
((elt . elts)
(lp elts (cons elt out))))))
(match (next)
((? eof-object?) (finish on-block-end))
(#\newline
(consume-indent
indent
(lambda ()
(cond
((done? #\newline) => finish)
(else (lp (cons #\newline elts)))))
(lambda ()
(finish on-block-end))))
((= done? (and kdone (not #f))) (finish kdone))
((and (or #\_ #\*) delim) (read-emph indent delim continue))
(#\` (read-code indent continue))
(#\[
(read-link indent continue))
(#\<
(unget1 #\<)
(match (parse-one-xml-element port)
(#f (lp elts))
(elt (continue `(inline-xml ,elt)))))
(#\\ (lp (cons (next-not-eof "backslash") elts)))
(#\! (match (next)
((? eof-object?) (lp (cons #\! elts)))
(#\[
(read-link indent
(lambda (link)
(match link
(('link dest . alt)
(continue `(image ,dest . ,alt)))))))
(ch (lp (cons* ch #\! elts)))))
(ch (lp (cons ch elts))))))
(define (read-para indent kup knext)
(define (make-continuation reader)
(lambda (para)
(reader indent kup (lambda (nodelist)
(knext (cons para nodelist))))))
(define (done? ch)
(match ch
(#\newline
(let ((ch (next)))
(match ch
((? eof-object?)
(lambda (para)
(kup (knext (list para)) ch empty-indent)))
(ch
(read-block-type ch #t
make-continuation
(lambda (chars)
(unget chars)
#f))))))
(_ #f)))
(read-text 'para indent done? (make-continuation read-block-list)))
(define (read-para* chars indent kup knext)
(unget chars)
(read-para indent kup knext))
(define (read-heading level knext)
(error "unimplemented"))
(define (read-li marker marker-indent marker-size kup knext)
(read-indent
(lambda (ch indent)
(match indent
((outer . inner)
(match marker-indent
((marker-outer ... marker-inner)
(let ((body-indent
(append marker-outer
(list (+ marker-inner marker-size outer))
inner)))
(pk 'hey marker marker-indent marker-size indent body-indent ch)
(read-indented-block
ch marker-indent body-indent
(lambda (elt ch indent)
(pk ch marker-indent indent elt)
(read-indented-block ch marker-indent indent kup
(lambda (nodelist)
(knext (cons elt nodelist)))))
(lambda (nodelist)
(cons* 'li marker nodelist)))))))))))
(define (read-block ch indent kup knext)
(define (have-block read-block)
(read-block indent kup knext))
(define (have-text chars)
(read-para* chars indent kup knext))
(read-block-type ch #f have-block have-text))
(define (read-block-type ch in-text? kblock ktext)
(define (make-continue indent kup knext)
(lambda (block)
(read-block-list indent kup
(lambda (nodelist)
(knext (cons block nodelist))))))
(match ch
(#\#
(let lp ((level 1))
(match (next-not-eof "heading")
(#\#
(lp (1+ level)))
(#\space
(kblock
(lambda (indent kup knext)
(read-heading level (make-continue indent kup knext)))))
(#\newline
(kblock
(lambda (indent kup knext)
((make-continue indent kup knext) `(heading ,level)))))
(ch
(ktext (cons ch (make-list level #\#)))))))
(#\`
(match (next)
((? eof-object?) (ktext '(#\`)))
(#\`
(match (next)
((? eof-object?) (ktext '(#\` #\`)))
(#\`
(kblock
(lambda (indent kup knext)
(match (next-line-and-delim)
((tag . delim)
(cond
((eof-object? delim)
(error "eof while reading code block"))
(else
(read-pre (if (string-null? tag) #f tag) indent
(make-continue indent kup knext)))))))))
(ch
(ktext (list ch #\` #\`)))))
(ch
(ktext (list ch #\`)))))
((or #\- #\* #\+)
(match (peek)
((or #\space #\tab)
(kblock (lambda (indent kup knext)
(read-li ch indent 1 kup knext))))
(_
(ktext (list ch)))))
((or #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
(let lp ((chars (list ch)))
(let ((ch (next)))
(match ch
((? eof-object?) (ktext chars))
((or #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
(lp (cons ch chars)))
((or #\. #\))
;; fixme: record delimiter
(match (next)
((? eof-object?)
(ktext (cons ch chars)))
((and ch2 (or #\space #\tab))
(unget1 ch2)
(if (or (not in-text?) (equal? chars '(#\1)))
(kblock
(lambda (indent kup knext)
(let ((idx (fold (lambda (ch idx)
(+ (* idx 10)
(- (char->integer ch)
(char->integer #\0))))
0 chars)))
(read-li idx indent (1+ (length chars))
kup knext))))
(ktext (cons ch chars))))
(ch2
(ktext (cons* ch2 ch chars)))))
(_
(ktext (cons ch chars)))))))
(#\<
(unget1 #\<)
(if in-text?
(ktext '())
(kblock
(lambda (indent kup knext)
(match (parse-one-xml-element port)
(#f (read-block-list indent kup knext))
(elt ((make-continue indent kup knext) `(block-xml ,elt))))))))
(#\newline
;; fixme: record loose li
(kblock read-block-list))
((? eof-object?)
(kblock
(lambda (indent kup knext)
(kup (knext '()) ch empty-indent))))
(_
(ktext (list ch)))))
(define (read-indented-block ch outer-indent indent kup knext)
(define (recurse kind recurse-indent)
(read-indented-block ch recurse-indent indent
(lambda (nested ch indent)
(read-indented-block ch outer-indent indent
kup
(lambda (nodelist)
(knext (cons nested nodelist)))))
(lambda (nodelist)
(cons kind nodelist))))
(match (compare-indents outer-indent indent)
('same
(read-block ch indent kup knext))
((or 'outer
'outer-blockquote
'different-blockquote)
(kup (knext '()) ch indent))
('inner
(recurse 'begin indent))
('inner-blockquote
(recurse 'blockquote (list-head indent (1+ (length outer-indent)))))))
(define (parse-error reason)
(error reason))
(define (read-block-list outer-indent kup knext)
(read-indent
(lambda (ch indent)
(read-indented-block ch outer-indent indent kup knext))))
(read-block-list empty-indent
(lambda (nodelist ch indent) nodelist)
(lambda (nodelist) (cons 'top nodelist))))
(define* (smarxdown->shtml exp #:key
(heading-offset 0)
(handle-inline-xml identity)
(handle-block-xml identity))
(define (transform-inline exp)
(match exp
(('inline-xml xml) (handle-inline-xml xml))
(('code . body) exp)
(('emph . body) `(emph . ,(map transform-inline body)))
(('strong . body) `(strong . ,(map transform-inline body)))
(('link dest . body) `(a (@ (href ,dest)) . ,(map transform-inline body)))
((? string? str) str)))
(define (transform-block exp)
(match exp
(('block-xml xml) (handle-block-xml xml))
(('para . body) `(p . ,(map transform-inline body)))
(('blockquote . body) `(blockquote . ,(map transform-block body)))
(('list-item . body) (error "fixme"))
(('pre #f . body) `(pre . ,body))
(('pre info . body) `(pre (@ (class ,(string-append "pre-" info)))
. ,body))
(('heading level . body)
(let* ((level (+ level heading-offset))
(tag (cond ((<= level 0) 'h1)
((<= level 6)
(string->symbol
(string #\h (integer->char
(+ level (char->integer #\0))))))
(else 'h6))))
`(,tag . ,(map transform-inline body))))))
(match exp
(('top exp ...)
`(div . ,(map transform-block exp)))))