# download.py -- as introduced by
# commit fb65246575871e0129b80911c3610606884451b0
# Author: ryuslash
# Date:   Fri Jan 15 08:22:17 2010 +0100
#
#     Initial commit
#
#     Can download images from /w/
#
# The same commit also added a .gitignore holding the single pattern "*~".
#
# This is Python 2 code (sgmllib, urllib.urlopen).  Messages are printed with
# the single-argument print(...) form, which the Python 2 print statement
# accepts unchanged and which emits exactly the same text as before.

import urllib
import sgmllib
import re
import os

savedir = "/home/slash/Pictures/4grab/"

# How many times each network operation is attempted before giving up.
MAX_TRIES = 10


class MyParser(sgmllib.SGMLParser):
    """Collect the href of every <a> tag whose target matches 'url_reg'.

    Matches accumulate in 'hyperlinks' across calls to parse(), so a single
    instance can be fed several pages one after another.
    """

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []

        # hrefs ending in "res/<digits>".
        self.url_reg = re.compile(r'res/\d+\Z')
        # Not consulted anywhere in this file; kept because it is a public
        # attribute of the original class.
        self.img_reg = re.compile(r'/\d+\.(jpg|gif|bmp|png|jpeg)\Z')

    def parse(self, s):
        "Parse the given string 's'."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."
        for name, value in attributes:
            if name == "href" and self.url_reg.search(value) is not None:
                self.hyperlinks.append(value)

    def get_hyperlinks(self):
        "Return the list of hyperlinks."
        return self.hyperlinks


class MySubParser(MyParser):
    "Same as MyParser, but matches hrefs ending in '/src/<digits>.<ext>'."

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')


def _fetch(url, tries=MAX_TRIES):
    """Return the content read from 'url', or None if every attempt failed.

    Opening is retried up to 'tries' times on IOError.  Returning None on
    failure replaces the original 'if not f is None' test, which could never
    see None: 'f' was either unbound (NameError) or still held the closed
    handle of the previous page.
    """
    while tries > 0:
        try:
            f = urllib.urlopen(url)
        except IOError:
            tries = tries - 1
            print("Try of %s failed, %s tries left" % (url, tries))
            continue
        # Close the handle even when read() raises.
        try:
            return f.read()
        finally:
            f.close()
    return None


def _download(link, filename, tries=MAX_TRIES):
    """Save 'link' to 'filename'; return True on success, False otherwise.

    The transfer is retried up to 'tries' times on IOError.
    """
    while tries > 0:
        try:
            urllib.urlretrieve(link, filename)
        except IOError:
            tries = tries - 1
            print("Downloading of %s failed, %s left" % (link, tries))
            # A failed transfer can leave a truncated file behind; remove it
            # so a later run does not mistake it for a finished download.
            if os.path.exists(filename):
                os.remove(filename)
        else:
            return True
    return False


def main():
    "Collect thread links from the board, then image links, then download."
    base_url = "http://boards.4chan.org/w/"

    # Stage 1: scan the board index pages for links to threads.
    myparser = MyParser()
    total = 10
    for i in range(total):
        # The first page is the bare board URL, later pages append a number.
        if i > 0:
            url = base_url + str(i)
        else:
            url = base_url

        s = _fetch(url)
        if s is not None:
            myparser.parse(s)
            print("Parsed %s - %s of %s" % (url, i + 1, total))
        else:
            # The original lacked 'print' here, so this was never shown.
            print("Opening of %s did not succeed, trying next one..." % url)

    # Stage 2: scan every thread page for links to images.
    t = myparser.get_hyperlinks()
    mysubparser = MySubParser()
    total = len(t)
    for i, link in enumerate(t):
        img_url = base_url + link
        s = _fetch(img_url)
        if s is not None:
            mysubparser.parse(s)
            print("Parsed %s - %s of %s" % (img_url, i + 1, total))
        else:
            print("Opening of %s did not succeed, trying next one..."
                  % img_url)

    # Stage 3: download every image that is not on disk yet.
    t = mysubparser.get_hyperlinks()
    total = len(t)
    # Without the target directory every single download would fail.
    if t and not os.path.isdir(savedir):
        os.makedirs(savedir)
    for i, link in enumerate(t):
        filename = os.path.join(savedir, os.path.split(link)[1])
        if os.path.exists(filename):
            print("Not downloading %s already downloaded" % link)
            continue
        if _download(link, filename):
            print("Retrieved %s - %s of %s" % (link, i + 1, total))


if __name__ == "__main__":
    main()