2010-12-22 16:12:07 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
import os
|
|
|
|
import pycurl
|
2010-12-23 11:16:20 +01:00
|
|
|
import re
|
2010-12-22 16:12:07 +01:00
|
|
|
import string
|
|
|
|
import sys
|
|
|
|
import tempfile
|
|
|
|
import time
|
|
|
|
import urllib
|
|
|
|
import urlparse
|
|
|
|
import xmlrpclib
|
|
|
|
|
|
|
|
from HTMLParser import HTMLParser
|
|
|
|
|
2010-12-23 03:14:59 +01:00
|
|
|
# Upper bound handed to the XML-RPC calls below: the number of recent posts
# requested from metaWeblog.getRecentPosts and the "number" filter given to
# wp.getComments.
MAX_POST_NO = 10000
|
|
|
|
|
2010-12-22 16:12:07 +01:00
|
|
|
class WordPress:
    """Small client around a blog's XML-RPC endpoint.

    Keeps the credentials and an ``xmlrpclib.ServerProxy`` for ``url`` and
    exposes the two remote calls this script needs.
    """

    def __init__(self, url, username, password):
        """Remember the credentials and build the XML-RPC proxy for ``url``."""
        self.__username = username
        self.__password = password
        self.__proxy = xmlrpclib.ServerProxy(url)

    def all_posts(self):
        """Return the result of ``metaWeblog.getRecentPosts``.

        The call asks for at most ``MAX_POST_NO`` posts and passes an empty
        string as the first (blog id) argument.
        """
        fetch_posts = self.__proxy.metaWeblog.getRecentPosts
        return fetch_posts("", self.__username, self.__password, MAX_POST_NO)

    def post_comments(self, post):
        """Return the result of ``wp.getComments`` for ``post``.

        ``post`` must provide a ``"postid"`` entry; at most ``MAX_POST_NO``
        comments are requested.
        """
        comment_filter = {"post_id": post["postid"], "number": MAX_POST_NO}
        fetch_comments = self.__proxy.wp.getComments
        return fetch_comments("", self.__username, self.__password,
                              comment_filter)
|
2010-12-22 16:12:07 +01:00
|
|
|
|
|
|
|
class MyHTMLParser(HTMLParser):
    """HTML parser that downloads the ``<img>`` files hosted on one site.

    ``url`` is compared with the network location of every image ``src``;
    images whose host matches are saved below ``imgdir`` using the URL path
    as the relative file name.
    """

    def __init__(self, url, imgdir):
        """Store the host to match (``url``) and the download root (``imgdir``)."""
        HTMLParser.__init__(self)
        self.url = url
        self.imgdir = imgdir

    def handle_starttag(self, tag, attrs):
        """Trigger a download for every ``src`` attribute of an ``<img>`` tag."""
        if tag != "img":
            return
        for name, value in attrs:
            if name == "src":
                self.download_image(value)

    def download_image(self, url):
        """Fetch ``url`` with pycurl when its host equals ``self.url``.

        URLs on any other host are ignored.  The output file and the curl
        handle are always released, even when the transfer raises.
        """
        parsed = urlparse.urlparse(url)
        if parsed.netloc != self.url:
            return

        print(" downloading %s" % url)
        # Drop the leading "/" so the URL path becomes relative to imgdir.
        filename = os.path.join(self.imgdir, parsed.path[1:])
        try:
            os.makedirs(os.path.dirname(filename))
        except os.error:
            # Best effort: the directory usually exists already from an
            # earlier image; any real problem surfaces in open() below.
            pass

        fp = open(filename, "wb")
        try:
            curl = pycurl.Curl()
            try:
                # str() because the URL may arrive as a unicode object.
                curl.setopt(pycurl.URL, str(url))
                curl.setopt(pycurl.FOLLOWLOCATION, 1)
                curl.setopt(pycurl.MAXREDIRS, 5)
                curl.setopt(pycurl.CONNECTTIMEOUT, 30)
                curl.setopt(pycurl.TIMEOUT, 300)
                curl.setopt(pycurl.NOSIGNAL, 1)
                curl.setopt(pycurl.WRITEDATA, fp)
                curl.perform()
            finally:
                curl.close()
        finally:
            fp.close()
|
|
|
|
|
2010-12-23 17:42:03 +01:00
|
|
|
def html_media_object(service, url, media_id):
    """Return centered HTML embedding a video from ``service``.

    ``service`` is ``"youtube"`` or ``"googlevideo"``; any other value
    raises ``KeyError``.  ``url`` is the host name and ``media_id`` the
    video identifier; both are substituted into the service's template.
    """
    services = {
        "youtube": (
            '<object width="480" height="385">'
            # The "?" before fs=1 was missing here, unlike in the <embed>
            # URL below, which produced a broken movie URL.
            '<param name="movie" value="http://%url%/v/%media_id%?fs=1"></param>'
            '<param name="allowFullScreen" value="true"></param>'
            '<param name="allowscriptaccess" value="always"></param>'
            '<embed src="http://%url%/v/%media_id%?fs=1"'
            ' type="application/x-shockwave-flash"'
            ' allowscriptaccess="always" allowfullscreen="true"'
            ' width="480" height="385"></embed></object>'
        ),
        "googlevideo": (
            '<object type="application/x-shockwave-flash"'
            ' data="http://%url%/googleplayer.swf?docid=%media_id%"'
            ' height="330" width="400">'
            '<param name="allowScriptAccess" value="never"/>'
            '<param name="movie"'
            ' value="http://%url%/googleplayer.swf?docid=%media_id%"/>'
            '<param name="quality" value="best"/>'
            '<param name="bgcolor" value="#ffffff"/>'
            '<param name="scale" value="noScale"/>'
            '<param name="wmode" value="opaque"/></object>'
        ),
    }

    html_center_start = '<p><span style="text-align: center; display: block;">'
    html_center_end = '</span></p>'

    html_service = services[service]
    html_service = html_service.replace("%url%", url)
    html_service = html_service.replace("%media_id%", media_id)
    return html_center_start + html_service + html_center_end
|
|
|
|
|
|
|
|
def analyze_media(content):
    """Replace ``[youtube=...]`` / ``[googlevideo=...]`` shortcodes with HTML.

    ``content`` is processed line by line.  A line containing a shortcode is
    replaced *entirely* by the markup from ``html_media_object`` (any other
    text on that line is discarded); other lines are kept unchanged.  When
    both shortcodes occur on one line the YouTube one wins.
    """
    # The video id stops at the first "]": a greedy ".+" would run on to the
    # last "]" of the line and swallow whatever follows the shortcode.
    p_youtube = re.compile(
        r"\[youtube=http://([a-zA-Z0-9\-\.]+)/watch\?v=([^\]]+)\]")
    p_googlevideo = re.compile(
        r"\[googlevideo=http://([a-zA-Z0-9\-\.]+)/videoplay\?docid=([\-0-9]+).*\]")
    patterns = (("youtube", p_youtube), ("googlevideo", p_googlevideo))

    new_lines = []
    for line in content.split("\n"):
        for service, pattern in patterns:
            match = pattern.search(line)
            if match:
                line = html_media_object(service,
                                         match.group(1),
                                         match.group(2))
                break
        new_lines.append(line)
    return "\n".join(new_lines)
|
|
|
|
|
2010-12-22 16:12:07 +01:00
|
|
|
def make_dir(path):
    """Create the directory ``path`` and return it with a trailing ``/``.

    Uses ``os.mkdir``, so the call raises if the directory cannot be created.
    """
    os.mkdir(path)
    return "".join((path, "/"))
|
|
|
|
|
|
|
|
def write_file(path, content):
    """Write the text ``content`` to ``path`` encoded as UTF-8.

    The text is encoded before the file is opened, so an encoding error does
    not leave a truncated file behind.  The file is opened in binary mode
    because encoded bytes are written, and it is closed even if the write
    fails.
    """
    data = content.encode("utf-8")
    f = open(path, "wb")
    try:
        f.write(data)
    finally:
        f.close()
|
|
|
|
|
|
|
|
def write_comment(comment, dir):
    """Write one comment to the file ``dir + str(comment["comment_id"])``.

    The file holds ``name: value`` metadata lines, a blank line, then the
    comment body.  ``comment`` must provide the keys listed in ``fields``
    plus ``"date_created_gmt"`` and ``"content"``.
    """
    # Local import: calendar is not among the file's top-level imports.
    import calendar

    # (key in the comment struct, name written to the metadata file).
    # A tuple keeps the line order stable; a dict would make it arbitrary.
    fields = (("comment_id", "id"),
              ("author", "author"),
              ("author_email", "author_email"),
              ("author_url", "author_url"),
              ("author_ip", "author_ip"),
              ("status", "approved"))

    def make_metadata():
        lines = []
        for source_key, name in fields:
            value = comment[source_key]
            # The status line is left out for comments flagged as spam.
            if source_key == "status" and value == "spam":
                continue
            # Emit the mapped name, as write_post does for post metadata
            # (previously the mapping was defined but never used).
            lines.append("%s: %s\n" % (name, value))
        date = comment["date_created_gmt"]
        # The value is GMT, so convert with timegm(); mktime() would read the
        # tuple as local time and shift the timestamp by the UTC offset.
        lines.append("timestamp: %s\n" % calendar.timegm(date.timetuple()))
        return "".join(lines)

    write_file(dir + str(comment["comment_id"]),
               make_metadata() + "\n" + comment["content"])
|
|
|
|
|
|
|
|
def unescape(s):
    """Return ``s`` with the entities ``&lt;``, ``&gt;`` and ``&amp;`` decoded.

    ``&amp;`` is handled last so that an escaped entity such as ``&amp;lt;``
    becomes the literal text ``&lt;`` instead of being decoded twice.
    """
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&amp;", "&")
    return s
|
|
|
|
|
|
|
|
def make_post_key(post):
    """Build the lower-cased, fully URL-quoted key ``year/mm/dd/slug``.

    The date comes from ``post["dateCreated"]`` and the slug from
    ``post["wp_slug"]``.  No character is treated as safe when quoting, so
    the ``/`` separators are percent-encoded as well.
    """
    created = post["dateCreated"].timetuple()
    parts = (created.tm_year, created.tm_mon, created.tm_mday,
             post["wp_slug"])
    raw_key = "%d/%02d/%02d/%s" % parts
    quoted_key = urllib.quote(raw_key, "")
    return quoted_key.lower()
|
|
|
|
|
|
|
|
def get_post_images(post, url, imgdir):
    """Run ``post["description"]`` through a ``MyHTMLParser``.

    The parser is built with ``url`` and ``imgdir`` and is fed the post body
    in one call.
    """
    downloader = MyHTMLParser(url, imgdir)
    body = post["description"]
    downloader.feed(body)
|
|
|
|
|
|
|
|
def write_post(post, categories, comments, images_url, new_images_url):
    """Write one post, its metadata and its comments below the current dir.

    A directory named after ``make_post_key(post)`` is created.  It receives
    a ``content`` file (the post body with ``images_url`` replaced by
    ``new_images_url`` and media shortcodes expanded), a ``metadata`` file,
    and, when ``comments`` is non-empty, a ``comments`` subdirectory with
    one file per comment.
    """
    print("writing post %s" % unescape(post["title"]))

    def make_metadata(key):
        # Post struct key -> name used in the metadata file.
        keys = {"postid": "id",
                "wp_author_display_name": "author",
                "title": "title",
                "post_status": "status",
                "mt_allow_comments": "comment_status"}
        pieces = []
        for field in keys:
            field_value = post[field]
            if field == "mt_allow_comments":
                # Only the value 1 counts as open; anything else is closed.
                field_value = "open" if field_value == 1 else "closed"
            pieces.append("%s: %s\n" % (keys[field], field_value))
        pieces.append("name: %s\n" % key)
        pieces.append("tags: %s\n" % ", ".join(categories))
        created = post["dateCreated"].timetuple()
        pieces.append("timestamp: %s\n" % int(time.mktime(created)))
        return unicode("".join(pieces))

    key = make_post_key(post)
    post_dir = make_dir(key)

    body = post["description"].replace(images_url, new_images_url)
    body = analyze_media(body)
    write_file(post_dir + "content", body)
    write_file(post_dir + "metadata", make_metadata(key))

    if not comments:
        return
    comments_dir = make_dir(post_dir + "comments")
    for entry in comments:
        write_comment(entry, comments_dir)
|
|
|
|
|
|
|
|
def main(args):
    """Export every post, comment and same-site image of a WordPress blog.

    ``args`` is the full argument vector: program name, XML-RPC URL, current
    images URL, new images URL, user and password.  With any other number of
    arguments the usage text is printed and the process exits with status 1.
    Posts are written below one fresh temporary directory and images below
    another.
    """
    # Validate the arguments first so that a usage error does not leave two
    # empty temporary directories behind.
    try:
        _, url, images_url, new_images_url, user, passwd = args
    except ValueError:
        print("")
        print("usage: wordpress-xmlrpc-to-dir.py url images_url new_images_url user password")
        print("")
        print(" url Wordpress XMLRPC URL")
        print(" images_url Current Wordpress images URL (without http://)")
        print(" new_images_url New tekuti images URL (without http://)")
        print(" user Wordpress user")
        print(" password Wordpress user password")
        print("")
        sys.exit(1)

    d_posts = tempfile.mkdtemp(prefix="wp2dir")
    d_images = tempfile.mkdtemp(prefix="wp2img")
    print("creating directories %s and %s" % (d_posts, d_images))

    wp = WordPress(url, user, passwd)
    posts = wp.all_posts()
    for post in posts:
        comments = wp.post_comments(post)
        # write_post creates its directories relative to the working dir.
        os.chdir(d_posts)
        write_post(post, post["categories"], comments, images_url, new_images_url)
        os.chdir(d_images)
        get_post_images(post, images_url, d_images)
|
|
|
|
|
|
|
|
# Script entry point: pass the complete argument vector (program name
# included) to main(), which unpacks and validates it.
if __name__ == "__main__":
    main(sys.argv)
|
|
|
|
|