From 0b476a921dabd38d7ca80d8df3989abc077ad11c Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 21 Feb 2008 23:48:44 +0100 Subject: another checkpoint, la la la --- src/tekuti | 2 +- src/upgrade-metadata | 92 ++++++++++++++++++++++++++++++++++++++++++++ tekuti/boot.scm | 6 ++- tekuti/categories.scm | 4 +- tekuti/comment.scm | 5 +-- tekuti/filters.scm | 89 +++++++++++++++++++++++++++++++++++++++++++ tekuti/git.scm | 32 +++++++--------- tekuti/mod-lisp.scm | 61 ++++++++++++++++++++++-------- tekuti/post.scm | 23 ++++++----- tekuti/util.scm | 44 ++++++++++++++++++++- wordpress-to-dir.py | 103 +++++++++++++++++++++++++++++++++----------------- 11 files changed, 373 insertions(+), 88 deletions(-) create mode 100755 src/upgrade-metadata create mode 100644 tekuti/filters.scm diff --git a/src/tekuti b/src/tekuti index bfff6b2..54500d2 100755 --- a/src/tekuti +++ b/src/tekuti @@ -30,7 +30,7 @@ exec guile $GUILE_FLAGS -l $0 -e main -- "$@" ;;hack! -(turn-on-debugging) +;(turn-on-debugging) (use-modules (tekuti boot)) (define (main args) (start-stack 'tekuti diff --git a/src/upgrade-metadata b/src/upgrade-metadata new file mode 100755 index 0000000..14edadb --- /dev/null +++ b/src/upgrade-metadata @@ -0,0 +1,92 @@ +#! /bin/bash +# -*- scheme -*- +exec guile $GUILE_FLAGS -l $0 -e main -- "$@" +!# +;; Tekuti +;; Copyright (C) 2008 Andy Wingo + +;; This program is free software; you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation; either version 3 of +;; the License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program; if not, contact: +;; +;; Free Software Foundation Voice: +1-617-542-5942 +;; 59 Temple Place - Suite 330 Fax: +1-617-542-2652 +;; Boston, MA 02111-1307, USA gnu@gnu.org + +;;; Commentary: +;; +;; This is the main script that will launch tekuti. +;; +;;; Code: + +;; (turn-on-debugging) +(use-modules (tekuti git) (tekuti post)) + +;; order: name object type mode +(define (make-tree-full alist) + (string-trim-both + (git* '("mktree") + #:input (string-join + (map (lambda (l) + (apply format #f "~a ~a ~a\t~a" + (reverse l))) + alist) + "\n" 'suffix)))) + +(define (create-blob contents) + (string-trim-both + (git* '("hash-object" "-w" "--stdin") #:input contents))) + +(define (munge-comment sha1) + (let ((dir (git-ls-tree sha1 #f)) + (metadata (parse-metadata (string-append sha1 ":metadata") + (@@ (tekuti comment) *comment-spec*)))) + (set! dir (assoc-set! dir "metadata" + (list (create-blob (with-output-to-string + (lambda () (write metadata)))) + "blob" "100644"))) + (make-tree-full dir))) + +(define (munge-comments alist) + (let ((comments (make-tree + (map (lambda (pair) + (cons (car pair) (munge-comment (cdr pair)))) + alist)))) + (make-tree (acons "comments" comments '())))) + +(define (munge-post pair) + (let* ((sha1 (cdr pair)) + (dir (git-ls-tree sha1 #f)) + (metadata (parse-metadata (string-append sha1 ":metadata") + (@@ (tekuti post) *post-spec*)))) + (set! dir (assoc-set! dir "metadata" + (list (create-blob (with-output-to-string + (lambda () (write metadata)))) + "blob" "100644"))) + (if (assoc "comments" dir) + (set! dir (assoc-set! dir "comments" + (list (munge-comments (git-ls-subdirs (string-append sha1 ":comments/") #f)) + "tree" "040000")))) + (pk 'out (cons (car pair) (make-tree-full dir))))) + +(define (main args) + (ensure-git-repo) + (let* ((master (git-rev-parse "master")) + (posts (git-ls-subdirs master #f))) + (let ((new-tree (make-tree (map munge-post posts)))) + (write + (git* `("commit-tree" ,new-tree "-p" ,master) + #:input "upgraded metadata to s-expressions"))))) + + + + diff --git a/tekuti/boot.scm b/tekuti/boot.scm index 2f35f3a..13e8e82 100644 --- a/tekuti/boot.scm +++ b/tekuti/boot.scm @@ -34,6 +34,7 @@ exec guile $GUILE_FLAGS -l $0 -e main -- "$@" #:use-module (ice-9 format) #:use-module (ice-9 getopt-long) #:use-module (ice-9 threads) + #:use-module (tekuti util) #:use-module (tekuti git) #:use-module (tekuti mod-lisp) #:export (boot)) @@ -73,7 +74,10 @@ exec guile $GUILE_FLAGS -l $0 -e main -- "$@" (version) (exit 0))) (if (option-ref opts 'gds #f) - (make-thread (@ (ice-9 gds-client) run-utility))) + (let ((run-utility (@ (ice-9 gds-client) run-utility))) + (make-thread + (lambda () + (with-backtrace run-utility))))) opts)) (define (boot args) diff --git a/tekuti/categories.scm b/tekuti/categories.scm index d2bea29..77ec047 100644 --- a/tekuti/categories.scm +++ b/tekuti/categories.scm @@ -41,5 +41,5 @@ posts) hash)) -(define (reindex-categories master) - (compute-categories (all-published-posts master))) +(define (reindex-categories index) + (compute-categories (assq-ref index 'posts))) diff --git a/tekuti/comment.scm b/tekuti/comment.scm index 69bff69..6754f77 100644 --- a/tekuti/comment.scm +++ b/tekuti/comment.scm @@ -31,8 +31,6 @@ #:use-module (srfi srfi-1) #:export (comment-from-tree build-comment-skeleton)) -(use-modules (ice-9 regex)) - (use-modules (ice-9 rdelim) (ice-9 popen) (srfi srfi-1) @@ -40,13 +38,12 @@ (tekuti url) (match-bind) (sxml transform)) -(debug-enable 'backtrace) (define *comment-spec* `((timestamp . ,string->number))) (define (comment-from-tree encoded-name sha1) (acons 'encoded-name encoded-name - (parse-metadata (string-append sha1 ":" "metadata") *comment-spec*))) + (parse-metadata (string-append sha1 ":" "metadata")))) (define (comment-timestamp comment-alist) (or (assq-ref comment-alist 'timestamp) #f)) diff --git a/tekuti/filters.scm b/tekuti/filters.scm new file mode 100644 index 0000000..d46e95c --- /dev/null +++ b/tekuti/filters.scm @@ -0,0 +1,89 @@ +;; Tekuti +;; Copyright (C) 2008 Andy Wingo + +;; This program is free software; you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation; either version 3 of +;; the License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program; if not, contact: +;; +;; Free Software Foundation Voice: +1-617-542-5942 +;; 59 Temple Place - Suite 330 Fax: +1-617-542-2652 +;; Boston, MA 02111-1307, USA gnu@gnu.org + +;;; Commentary: +;; +;; This is the main script that will launch tekuti. +;; +;;; Code: + +(define-module (tekuti filters) + #:use-module (sxml simple) + #:use-module (sxml transform) + #:use-module (match-bind) + #:export (wordpress->sxml)) + +(define blocks '(table thead tfoot caption colgroup tbody tr td th div + dl dd dt ul ol li pre select form map area blockquote + address math style input p h1 h2 h3 h4 h5 h6)) + +(define (can-contain-p? tag) + (memq tag '(div li blockquote))) + +(define (inline? tag) + (not (memq tag blocks))) + +(define (wpautop tag body) + (define (pclose p out) + (if p (cons (cons 'p (reverse p)) out) out)) + (define (scons x p) + (if (and (string? x) (string-null? x)) + p + (cons x (or p '())))) + (define (pbreak p) + (if p (cons '(br) p) p)) + + (let* ((head (if (and (pair? (car body)) + (eq? (caar body) '@)) + (list tag (car body)) + (list tag))) + (body (if (null? (cdr head)) body (cdr body)))) + (let lp ((p #f) (in body) (out (reverse head))) + (cond + ((null? in) + (reverse (pclose p out))) + ((string? (car in)) + (match-bind "^([^\n]*)\n(\n*)(.*)$" (car in) (_ head p-or-br? tail) + (if (string-null? p-or-br?) + (lp (if (string-null? tail) + (scons head p) + (pbreak (scons head p))) + (scons tail (cdr in)) out) + (lp #f (scons tail (cdr in)) + (pclose (scons head p) out))) + (lp (cons (car in) (or p '())) + (cdr in) out))) + ((inline? (caar in)) + (lp (scons (car in) p) (cdr in) out)) + (else + (lp #f (cdr in) + (cons (car in) (pclose p out)))))))) + +(define (wordpress->sxml text) + (let ((sxml (cadr (with-input-from-string (string-append "
" text "
") + xml->sxml)))) + (pre-post-order + sxml + `((*default* . ,(lambda (tag . body) + (if (can-contain-p? tag) + (wpautop tag body) + (cons tag body)))) + (*text* . ,(lambda (tag text) + text)))))) diff --git a/tekuti/git.scm b/tekuti/git.scm index 11f739c..35323dd 100644 --- a/tekuti/git.scm +++ b/tekuti/git.scm @@ -31,7 +31,6 @@ #:use-module (tekuti config) #:use-module (scheme kwargs) #:use-module (match-bind) - #:use-module (ice-9 regex) ; hack #:use-module (srfi srfi-34) #:use-module (srfi srfi-35) #:export (&git-condition git-condition? git-condition-argv @@ -61,6 +60,12 @@ (string->list str)) (display #\')))) +(define *debug* #f) +(define (trc . args) + (if *debug* + (apply pk args) + (car (last-pair args)))) + (define (run-git env input-file args) (define (prepend-env args) (if (null? env) @@ -70,10 +75,11 @@ (cons* *git* "--bare" args)) (define (redirect-input args) (if input-file - (list "/bin/sh" "-c" (string-join (map shell:quote args) " ") - "<" input-file) + (list "/bin/sh" "-c" + (string-append (string-join (map shell:quote args) " ") + "<" input-file)) args)) - (let* ((real-args (pk (redirect-input (prepend-env (prepend-git args))))) + (let* ((real-args (trc (redirect-input (prepend-env (prepend-git args))))) (pipe (apply open-pipe* OPEN_READ real-args)) (output (read-delimited "" pipe)) (ret (close-pipe pipe))) @@ -87,7 +93,7 @@ (define (call-with-temp-file contents proc) (let* ((template (string-copy "/tmp/tekutiXXXXXX")) (tmp (mkstemp! template))) - (display input tmp) + (display contents tmp) (close tmp) (unwind-protect (proc template) @@ -99,6 +105,7 @@ (call-with-temp-file input (lambda (tempname) + (trc input) (run-git env tempname args))) (run-git env #f args))) @@ -130,19 +137,8 @@ "^(.+) tree (.+)\t(.+)$" (_ mode object name) (cons name object))) -(define (parse-metadata treeish specs) - (filter - identity - (match-lines (git "cat-file" "blob" treeish) - "^([^: ]+): +(.*)$" (_ k v) - (let* ((k (string->symbol k)) - (parse (assq-ref specs k))) - (if parse - (catch 'parse-error - (lambda () - (cons k (parse v))) - (lambda args #f)) - (cons k v)))))) +(define (parse-metadata treeish) + (with-input-from-string (git "show" treeish) read)) (define (parse-commit commit) (let ((text (git "cat-file" "commit" commit))) diff --git a/tekuti/mod-lisp.scm b/tekuti/mod-lisp.scm index 7c22267..b796815 100644 --- a/tekuti/mod-lisp.scm +++ b/tekuti/mod-lisp.scm @@ -28,6 +28,7 @@ #:use-module (ice-9 rdelim) #:use-module (ice-9 receive) #:use-module (ice-9 stack-catch) + #:use-module ((srfi srfi-1) #:select (fold)) #:use-module (sxml simple) #:use-module (sxml transform) #:use-module (tekuti config) @@ -74,8 +75,8 @@ (define (connection-received socket sockaddr index) (let* ((headers (pk (read-headers socket))) (post-data (read-chars (string->number - (pk (or (assoc-ref headers "content-length") - "0"))) + (or (assoc-ref headers "content-length") + "0")) socket))) (catch #t @@ -101,35 +102,63 @@ (define (with-socket proc) (pk 'listening) (let ((socket (socket PF_INET SOCK_STREAM 0))) + (setsockopt socket SOL_SOCKET SO_REUSEADDR 1) (bind socket AF_INET (inet-aton *host*) *port*) (listen socket *backlog*) (unwind-protect (proc socket) (shutdown socket 2)))) +(define (with-time-debugging proc) + (let ((start (gettimeofday))) + (pk 'start-clock) + (let ((val (proc))) + (let ((stop (gettimeofday))) + (pk 'stop-clock (+ (- (car stop) (car start)) + (* 1e-6 (- (cdr stop) (cdr start))))) + + val)))) + +(define indices + `((posts . ,reindex-posts) + (categories . ,reindex-categories))) + +(use-modules (statprof)) +(define (reindex master) + (with-backtrace + (lambda () + (with-time-debugging + (lambda () + (with-statprof #:hz 100 + (fold (lambda (pair index) + (acons (car pair) ((cdr pair) index) + index)) + (acons 'master master '()) + indices))))))) + +(define (maybe-reindex old-master old-index) + (let ((master (git-rev-parse "master"))) + (values + master + (if (equal? master old-master) + old-index + (reindex master))))) + (define (inner-loop socket cookie index) (let* ((pair (accept socket)) (fd (car pair)) (sockaddr (cdr pair))) (receive (new-cookie new-index) (maybe-reindex cookie index) - (pk new-cookie new-index) (connection-received (car pair) (cdr pair) new-index) (inner-loop socket new-cookie new-index)))) -(define (maybe-reindex old-master old-index) - (let ((master (git-rev-parse "master"))) - (values - master - (if (equal? master old-master) - old-index - (acons 'master master - (map (lambda (k reindex) - (cons k (reindex master))) - (list 'posts 'categories) - (list reindex-posts reindex-categories))))))) - (define (event-loop) (with-socket (lambda (socket) - (inner-loop socket #f #f)))) + (format #t "running initial index\n") + (receive + (master index) (maybe-reindex #f #f) + (format #t "entering inner loop\n") + (inner-loop socket master index))))) + diff --git a/tekuti/post.scm b/tekuti/post.scm index afa420f..3748e66 100644 --- a/tekuti/post.scm +++ b/tekuti/post.scm @@ -30,6 +30,7 @@ #:use-module (tekuti util) #:use-module (tekuti comment) #:use-module (tekuti git) + #:use-module (tekuti filters) #:use-module (srfi srfi-1) #:use-module (srfi srfi-19) #:export (reindex-posts post-from-tree post-from-key post-categories @@ -53,18 +54,20 @@ (categories . ,(lambda (v) (map string-trim-both (string-split v #\,)))) (title . ,identity))) -(define (post-from-tree encoded-name sha1) - (let ((treels (git-ls-tree sha1 #f))) - (acons 'key encoded-name - (acons 'content-sha1 (and=> (assoc "content" treels) cadr) - (parse-metadata (and=> (assoc "metadata" treels) cadr) - *post-spec*))))) +(define-memoized (post-from-tree encoded-name sha1) + (acons 'key encoded-name + (acons 'content-ref (string-append sha1 ":content") + (parse-metadata (string-append sha1 ":metadata"))))) (define (post-raw-content post) - (git "cat-file" "blob" (assq-ref post 'content-sha1))) + (git "show" (assq-ref post 'content-ref))) (define (post-sxml-content post) - `(pre ,(post-raw-content post))) + (let ((format (or (assq-ref post 'format) 'wordpress))) + ((case format + ((wordpress) wordpress->sxml) + (else (lambda (text) `(pre ,text)))) + (post-raw-content post)))) (define (post-readable-date post) (let ((date (time-utc->date @@ -98,5 +101,5 @@ comment-timestamp <)) -(define (reindex-posts master) - (all-published-posts master)) +(define (reindex-posts index) + (all-published-posts (assq-ref index 'master))) diff --git a/tekuti/util.scm b/tekuti/util.scm index edd8dba..6c9231e 100644 --- a/tekuti/util.scm +++ b/tekuti/util.scm @@ -29,7 +29,7 @@ #:use-module (srfi srfi-1) #:export (expanduser match-lines dbg unwind-protect dbg dsu-sort hash-push! list-has-length? list-head-match mapn - list-intersperse)) + list-intersperse with-backtrace define-memoized)) (define (expanduser path) (let ((parts (string-split path #\/))) @@ -91,3 +91,45 @@ (let loop ((l (cdr src-l)) (dest (cons (car src-l) '()))) (if (null? l) (reverse dest) (loop (cdr l) (cons (car l) (cons elem dest))))))) + +(define (handle-error key . args) + (let ((cep (current-error-port)) + (highlights (if (or (eq? key 'wrong-type-arg) + (eq? key 'out-of-range)) + (list-ref args 3) + '()))) + (newline cep) + (display "Backtrace:\n") + (display-backtrace (fluid-ref the-last-stack) cep + #f #f highlights) + (newline cep) + (if (= (length args) 4) + (apply display-error (fluid-ref the-last-stack) cep args) + (format cep "~a" args)) + (force-output cep) + (apply throw key args))) + +(define (with-backtrace proc) + (debug-enable 'backtrace) + (start-stack 'with-backtrace + (catch #t + proc + handle-error + (lambda args + (fluid-set! the-last-stack (make-stack #t 2 0)) + (apply throw args))))) + +(define (memoize1 proc) + (let ((old-args #f) (cache #f) (proc proc)) + (lambda args + (if (equal? args old-args) + cache + (let ((val (apply proc args))) + (set! old-args args) + (set! cache val) + val))))) + +(define-macro (define-memoized form . body) + `(begin + (define ,form ,@body) + (set! ,(car form) (,memoize1 ,(car form))))) diff --git a/wordpress-to-dir.py b/wordpress-to-dir.py index e92bd80..82669ee 100644 --- a/wordpress-to-dir.py +++ b/wordpress-to-dir.py @@ -4,41 +4,12 @@ import sys import tempfile import MySQLdb as db import os +import urllib +import time cxn = None def all_posts(): - `ID` bigint(20) unsigned NOT NULL auto_increment, - `post_author` bigint(20) NOT NULL default '0', - `post_date` datetime NOT NULL default '0000-00-00 00:00:00', - `post_date_gmt` datetime NOT NULL default '0000-00-00 00:00:00', - `post_content` longtext NOT NULL, - `post_title` text NOT NULL, - `post_category` int(4) NOT NULL default '0', - `post_excerpt` text NOT NULL, - `post_lat` float default NULL, - `post_lon` float default NULL, - `post_status` enum('publish','draft','private','static','object','attachment') NOT NULL default 'publish', - `comment_status` enum('open','closed','registered_only') NOT NULL default 'open', - `ping_status` enum('open','closed') NOT NULL default 'open', - `post_password` varchar(7) NOT NULL default '', - `post_name` varchar(67) NOT NULL default '', - `to_ping` text NOT NULL, - `pinged` text NOT NULL, - `post_modified` datetime NOT NULL default '0000-00-00 00:00:00', - `post_modified_gmt` datetime NOT NULL default '0000-00-00 00:00:00', - `post_content_filtered` text NOT NULL, - `post_parent` bigint(20) NOT NULL default '0', - `guid` varchar(85) NOT NULL default '', - `menu_order` int(11) NOT NULL default '0', - `post_type` varchar(34) NOT NULL default '', - `post_mime_type` varchar(34) NOT NULL default '', - `comment_count` bigint(20) NOT NULL default '0', - PRIMARY KEY (`ID`), - KEY `post_date` (`post_date`), - KEY `post_date_gmt` (`post_date_gmt`), - KEY `post_name` (`post_name`), - KEY `post_status` (`post_status`) cur = cxn.cursor() sql = ('select ID, post_author, post_date_gmt, post_content,' ' post_title, post_status, comment_status, post_name,' @@ -55,16 +26,78 @@ def all_posts(): else: break -def write_post(post): - print post['name'] +def post_categories(post): + cur = cxn.cursor() + sql = ('select cat_name from wp_categories c, wp_post2cat p2c' + ' where p2c.post_id=%s and p2c.category_id=c.cat_ID') + cur.execute(sql, (post['id'],)) + return [row[0] for row in cur.fetchall()] + +def post_comments(post): + cur = cxn.cursor() + sql = ('select comment_ID, comment_author, comment_author_email,' + ' comment_author_url, comment_author_IP,' + ' comment_date, comment_date_gmt, comment_content, comment_approved' + ' from wp_comments where comment_post_ID=%s') + cur.execute(sql, (post['id'],)) + keys = ('id', 'author', 'author_email', 'author_url', 'author_ip', + 'date', 'date-gmt', 'content', 'approved') + return [dict(zip(keys, row)) for row in cur.fetchall()] + +def write_file(path, content): + f = open(path, 'w') + f.write(content) + f.close() + +def make_dir(path): + os.mkdir(path) + return path + '/' + +def write_comment(comment, dir): + def make_metadata(): + out = '' + for k, v in comment.items(): + if k not in ('content',): + out += '%s: %s\n' % (k, v) + date = comment['date-gmt'] or comment['date'] + out += 'timestamp: %s\n' % int(time.mktime(date.timetuple())) + return out + + d = make_dir(dir + str(comment['id'])) + write_file(d + 'content', comment['content']) + write_file(d + 'metadata', make_metadata()) + +def make_post_key(post): + d = post['date'] + pre = '%d/%02d/%02d/%s' % (d.year, d.month, d.day, post['name']) + return urllib.quote(pre, '') + +def write_post(post, categories, comments): + def make_metadata(): + out = '' + for k, v in post.items(): + if k not in ('content', 'content_filtered'): + out += '%s: %s\n' % (k, v) + out += 'categories: %s\n' % ', '.join(categories) + out += 'timestamp: %s\n' % int(time.mktime(post['date'].timetuple())) + return out + + key = make_post_key(post) + d = make_dir(key) + write_file(d + 'content', post['content']) + write_file(d + 'content-filtered', post['content_filtered']) + write_file(d + 'metadata', make_metadata()) + c = make_dir(d + 'comments') + for comment in comments: + write_comment(comment, c) def main(args): global cxn d = tempfile.mkdtemp(prefix='wp2dir') print 'writing dir', d os.chdir(d) - _, host, user, passwd, db = args - cxn = db.connect(host=host, user=user, passwd=passwd, db=db) + _, host, user, passwd, database = args + cxn = db.connect(host=host, user=user, passwd=passwd, db=database) for post in all_posts(): write_post (post, post_categories (post), post_comments (post)) -- cgit v1.2.3-54-g00ecf