path: root/
diff options
Diffstat (limited to '')
1 files changed, 162 insertions, 0 deletions
diff --git a/ b/
new file mode 100644
index 0000000..6c442ba
--- /dev/null
+++ b/
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+import os
+import pycurl
+import string
+import sys
+import tempfile
+import time
+import urllib
+import urlparse
+import xmlrpclib
+from HTMLParser import HTMLParser
class WordPress:
    """Small XML-RPC client exposing the two WordPress calls this script uses.

    The credentials given to the constructor are sent along with every
    request made through the proxy.
    """

    # Number of posts requested from the server in a single call.
    MAX_POST_NO = 10000

    def __init__(self, url, username, password):
        """Store the credentials and create the XML-RPC proxy.

        url      -- endpoint handed to ``xmlrpclib.ServerProxy``.
        username -- user name sent with each call.
        password -- password sent with each call.
        """
        self.__username = username
        self.__password = password
        self.__proxy = xmlrpclib.ServerProxy(url)

    def all_posts(self):
        """Return whatever ``metaWeblog.getRecentPosts`` yields for up to
        ``MAX_POST_NO`` posts.
        """
        # The call was left unterminated in the original text; the post limit
        # (defined but otherwise unused) is restored as the final argument.
        # The first argument is sent as an empty string (presumably the blog
        # id slot -- confirm against the server's API).
        return self.__proxy.metaWeblog.getRecentPosts("",
                                                      self.__username,
                                                      self.__password,
                                                      self.MAX_POST_NO)

    def post_comments(self, post):
        """Return whatever ``wp.getComments`` yields for ``post``.

        post -- mapping with a "postid" entry; its value is sent as the
                "post_id" filter.
        """
        return self.__proxy.wp.getComments("",
                                           self.__username,
                                           self.__password,
                                           {"post_id": post["postid"]})
class MyHTMLParser(HTMLParser):
    """HTML parser that downloads the ``src`` of every ``<img>`` tag it sees,
    provided the image lives on the configured host.
    """

    def __init__(self, url, imgdir):
        """Set up the parser.

        url    -- value compared for equality with the network location
                  (``netloc``) of each image URL; other images are skipped.
        imgdir -- directory under which downloaded files are stored, using
                  the URL path as the relative file name.
        """
        HTMLParser.__init__(self)
        self.url = url
        self.imgdir = imgdir

    def handle_starttag(self, tag, attrs):
        """Trigger a download for each ``src`` attribute of an ``img`` tag."""
        if tag != "img":
            return
        for name, value in attrs:
            if name == "src":
                self.download_image(value)

    def download_image(self, url):
        """Fetch ``url`` into ``imgdir`` if its netloc equals ``self.url``.

        Intermediate directories are created as needed. Errors from pycurl
        or from the file system propagate to the caller; the output file and
        the curl handle are closed in every case.
        """
        parsed = urlparse.urlparse(url)
        if parsed.netloc != self.url:
            return
        print(" downloading %s" % url)
        # NOTE(review): the path is taken from the post's HTML unchecked, so
        # ".." components could point outside imgdir.
        filename = os.path.join(self.imgdir, parsed.path[1:])
        dirname = os.path.dirname(filename)
        try:
            os.makedirs(dirname)
        except os.error:
            # An already existing directory is expected; any other failure
            # (permissions, a file in the way, ...) must not be hidden.
            if not os.path.isdir(dirname):
                raise
        with open(filename, "wb") as fp:
            curl = pycurl.Curl()
            try:
                curl.setopt(pycurl.URL, url)
                curl.setopt(pycurl.FOLLOWLOCATION, 1)
                curl.setopt(pycurl.MAXREDIRS, 5)
                curl.setopt(pycurl.CONNECTTIMEOUT, 30)
                curl.setopt(pycurl.TIMEOUT, 300)
                curl.setopt(pycurl.NOSIGNAL, 1)
                curl.setopt(pycurl.WRITEDATA, fp)
                curl.perform()
            finally:
                # Release the handle even when a setopt/perform call raises.
                curl.close()
def make_dir(path):
    """Create the single directory ``path`` and return it with "/" appended.

    Relies on ``os.mkdir``, so its errors (for example when ``path`` already
    exists) propagate unchanged.
    """
    os.mkdir(path)
    with_separator = path + "/"
    return with_separator
def write_file(path, content):
    """Write ``content`` to ``path`` as UTF-8, replacing any existing file.

    path    -- file to create or overwrite.
    content -- text; it is encoded with ``content.encode("utf-8")``.

    The text is encoded before the file is opened, so an encoding error does
    not leave a truncated file behind. The file is opened in binary mode
    because already-encoded bytes are written, and it is closed even if the
    write fails.
    """
    data = content.encode("utf-8")
    with open(path, "wb") as f:
        f.write(data)
def write_comment(comment, dir):
    """Write one comment to the file ``dir + str(comment["comment_id"])``.

    The file contains "name: value" metadata lines, a "timestamp" line, an
    empty line and then ``comment["content"]``.

    comment -- mapping providing the fields listed in ``fields`` below plus
               "date_created_gmt" (an object with ``timetuple()``) and
               "content".
    dir     -- path prefix the comment id is appended to as-is, so it must
               already end with a separator.
    """
    # Function-scope import: the file's import block does not provide it.
    import calendar

    # Written in this fixed order so the output does not depend on dict
    # ordering.
    # NOTE(review): write_post emits renamed labels (e.g. "postid" -> "id");
    # here the field names are written unchanged, as the original code did.
    fields = ("comment_id",
              "author",
              "author_email",
              "author_url",
              "author_ip",
              "status")

    def make_metadata():
        lines = []
        for k in fields:
            # The status line is left out for comments flagged as spam.
            if k == "status" and comment[k] == "spam":
                continue
            lines.append("%s: %s\n" % (k, comment[k]))
        date = comment["date_created_gmt"]
        # The time tuple is in GMT; time.mktime() would read it as local time
        # and shift the result by the machine's UTC offset.
        lines.append("timestamp: %s\n" % int(calendar.timegm(date.timetuple())))
        return "".join(lines)

    write_file(dir + str(comment["comment_id"]),
               make_metadata() + "\n" + comment["content"])
def unescape(s):
    """Return ``s`` with "&lt;", "&gt;" and "&amp;" replaced by "<", ">", "&".

    The replacements run in that order, so "&amp;lt;" becomes "&lt;" and is
    not unescaped a second time.
    """
    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
        s = s.replace(entity, char)
    return s
def make_post_key(post):
    """Build the key for ``post``: "YYYY/MM/DD/slug", fully percent-quoted
    (including the slashes) and lower-cased.

    post -- mapping with "dateCreated" (an object with ``timetuple()``) and
            "wp_slug".
    """
    stamp = post["dateCreated"].timetuple()
    pieces = ("%d" % stamp.tm_year,
              "%02d" % stamp.tm_mon,
              "%02d" % stamp.tm_mday,
              "%s" % (post["wp_slug"],))
    raw_key = "/".join(pieces)
    # An empty "safe" set means "/" is quoted as well.
    quoted = urllib.quote(raw_key, "")
    return quoted.lower()
def get_post_images(post, url, imgdir):
    """Feed ``post["description"]`` to a ``MyHTMLParser(url, imgdir)``.

    Any downloading is done by the parser while it processes the markup.
    """
    MyHTMLParser(url, imgdir).feed(post["description"])
def write_post(post, categories, comments, images_url, new_images_url):
    """Write ``post`` into a freshly created directory named after its key.

    The directory (relative to the current working directory) receives a
    "content" file, a "metadata" file and, when ``comments`` is non-empty, a
    "comments" sub-directory with one file per comment.

    post           -- mapping with the fields read below.
    categories     -- strings joined with ", " into the "tags" line.
    comments       -- comments passed one by one to write_comment.
    images_url     -- substring of the post body to be replaced ...
    new_images_url -- ... by this string.
    """
    print("writing post %s" % unescape(post["title"]))

    def make_metadata():
        # Source field -> label written to the metadata file.
        labels = {"postid" : "id",
                  "wp_author_display_name" : "author",
                  "title" : "title",
                  "post_status" : "status",
                  "mt_allow_comments" : "comment_status"}
        lines = []
        for field, label in labels.items():
            value = post[field]
            if field == "mt_allow_comments":
                # Only the value 1 counts as open.
                value = "open" if value == 1 else "closed"
            lines.append("%s: %s\n" % (label, value))
        lines.append("tags: %s\n" % ", ".join(categories))
        created = post["dateCreated"].timetuple()
        lines.append("timestamp: %s\n" % int(time.mktime(created)))
        return unicode("".join(lines))

    post_dir = make_dir(make_post_key(post))
    body = post["description"].replace(images_url, new_images_url)
    write_file(post_dir + "content", body)
    write_file(post_dir + "metadata", make_metadata())
    if not comments:
        return
    comments_dir = make_dir(post_dir + "comments")
    for comment in comments:
        write_comment(comment, comments_dir)
def main(args):
    """Export every post, its comments and its images.

    args -- argv-style sequence: program name, url, images_url,
            new_images_url, user, passwd.

    Posts are written below one new temporary directory and images below
    another; both paths are printed. With any other number of arguments the
    function exits with a usage message before creating anything.

    NOTE(review): when run as a script the password arrives via sys.argv.
    """
    args = list(args)
    if len(args) != 6:
        prog = args[0] if args else "wp2dir"
        # A string argument makes sys.exit() print it to stderr and use a
        # non-zero exit status.
        sys.exit("usage: %s <url> <images_url> <new_images_url> <user> <passwd>"
                 % prog)
    _, url, images_url, new_images_url, user, passwd = args

    # Created only after the arguments are known to be usable, so a usage
    # error leaves no stray directories behind.
    d_posts = tempfile.mkdtemp(prefix="wp2dir")
    d_images = tempfile.mkdtemp(prefix="wp2img")
    print("creating directories %s and %s" % (d_posts, d_images))

    wp = WordPress(url, user, passwd)
    for post in wp.all_posts():
        comments = wp.post_comments(post)
        # write_post creates its directories relative to the current one.
        os.chdir(d_posts)
        write_post(post, post["categories"], comments, images_url, new_images_url)
        os.chdir(d_images)
        get_post_images(post, images_url, d_images)


if __name__ == "__main__":
    main(sys.argv)