import urllib
import sgmllib
import re
import os

savedir = "/home/slash/Pictures/4grab/"

class MyParser(sgmllib.SGMLParser):
    "A simple parser class."

    def parse(self, s):
        "Parse the given string 's'."
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url_reg = re.compile(r'res/\d+\Z')
        self.img_reg = re.compile(r'/\d+\.(jpg|gif|bmp|png|jpeg)\Z')

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."
        for name, value in attributes:
            if name == "href":
                if self.url_reg.search(value) is not None:
                    self.hyperlinks.append(value)

    def get_hyperlinks(self):
        "Return the list of hyperlinks."
        return self.hyperlinks

class MySubParser(MyParser):
    "Like MyParser, but collects links to the image files themselves."

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')

if __name__ == "__main__":
    # Stage 1: fetch the index pages of the 4chan.org /w/ imageboard
    # and collect links to the individual threads.
    base_url = "http://boards.4chan.org/w/"
    myparser = MyParser()

    total = 10
    for i in range(0, total):
        if i > 0:
            url = base_url + str(i)
        else:
            url = base_url

        # Fetch the page, retrying up to ten times on IOError.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries = tries - 1
                print "Try of", url, "failed,", tries, "tries left"

        if f is not None:
            # Read the page and feed it to the parser.
            s = f.read()
            f.close()
            myparser.parse(s)
            print "Parsed", url, "-", i + 1, "of", total
        else:
            print "Opening of", url, "did not succeed, trying next one..."

    # Stage 2: fetch each thread page and collect links to the images.
    t = myparser.get_hyperlinks()
    mysubparser = MySubParser()
    total = len(t)
    i = 1
    for link in t:
        img_url = base_url + link

        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries = tries - 1
                print "Try of", img_url, "failed,", tries, "tries left"

        if f is not None:
            s = f.read()
            f.close()
            mysubparser.parse(s)
            print "Parsed", img_url, "-", i, "of", total
        else:
            print "Opening of", img_url, "did not succeed, trying next one..."
        i = i + 1

    # Stage 3: download every image that is not already in savedir.
    t = mysubparser.get_hyperlinks()
    total = len(t)
    i = 1
    for link in t:
        filename = os.path.join(savedir, os.path.split(link)[1])
        if not os.path.exists(filename):
            tries = 10
            while tries > 0:
                try:
                    urllib.urlretrieve(link, filename)
                    print "Retrieved", link, "-", i, "of", total
                    break
                except IOError:
                    tries = tries - 1
                    print "Downloading of", link, "failed,", tries, "left"
        else:
            print "Not downloading", link, "already downloaded"
        i = i + 1
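The fetch-with-retries pattern appears three times above (index pages, thread pages, and image downloads). If you want to avoid the duplication, it could be pulled into a small helper. The sketch below is only one way to do that, using the same Python 2 urllib calls the script already relies on; the name open_with_retries is illustrative, not part of the original script.

import urllib

def open_with_retries(url, tries=10):
    "Return a file-like object for 'url', or None if every attempt fails."
    while tries > 0:
        try:
            return urllib.urlopen(url)
        except IOError:
            tries = tries - 1
            print "Try of", url, "failed,", tries, "tries left"
    return None

Each of the three loops could then start with f = open_with_retries(url) and keep its existing "if f is not None" handling unchanged.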