From 025a723a5a201009da392bca4c27c4eb25e9e734 Mon Sep 17 00:00:00 2001 From: ryuslash Date: Sun, 17 Jan 2010 03:48:23 +0100 Subject: Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module --- download.py | 125 +++++++++++++++++++++++++----------------------------------- 1 file changed, 51 insertions(+), 74 deletions(-) (limited to 'download.py') diff --git a/download.py b/download.py index 5c77654..f23d64c 100644 --- a/download.py +++ b/download.py @@ -1,82 +1,50 @@ import urllib -import sgmllib -import re import os +import htmlparser +import progressbar savedir = "/home/slash/Pictures/4grab/" - -class MyParser(sgmllib.SGMLParser): - "A simple parser class." - - def parse(self, s): - "Parse the given string 's'." - self.feed(s) - self.close() - - def __init__(self, verbose=0): - "Initialise an object, passing 'verbose' to the superclass." - - sgmllib.SGMLParser.__init__(self, verbose) - self.hyperlinks = [] - - self.url_reg = re.compile('res/\d+\Z') - self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z') - - def start_a(self, attributes): - "Process a hyperlink and its 'attributes'." - - for name, value in attributes: - if name == "href": - if self.url_reg.search(value) != None: - self.hyperlinks.append(value) - - def get_hyperlinks(self): - "Return the list of hyperlinks." 
- - return self.hyperlinks - -class MySubParser(MyParser): - def __init__(self, verbose=0): - MyParser.__init__(self, verbose) - self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z') -if __name__ == "__main__": - # Get a file-like object for the 4chan.org w/imgboard - base_url = "http://boards.4chan.org/w/" - myparser = MyParser() - total = 10 - for i in range(0, total): - if i > 0: - url = base_url + str(i) - else: - url = base_url - +def get_thread_links(baseurl): + myparser = htmlparser.MyParser() + t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] + i = 1 + total = len(t) + progress = progressbar.Progress(total) + + for pagenum in t: + progress.show_progress(i) + + url = base_url + pagenum tries = 10 while tries > 0: - try: + try: f = urllib.urlopen(url) break except IOError: - tries = tries - 1 - print "Try of", url, "failed,", tries, "tries left" + tries -= 1 + print "\rTry of", url, "failed,", tries, "tries left" if not f is None: - # Read the object + # Read the response s = f.read() f.close() - - # Try and process the page. - # The class should have been defined first, remember. + + # Process the page. myparser.parse(s) - print "Parsed", url, "-", i + 1, "of", total else: - "Opening of", url, "did not succeed, trying next one..." - - # Get the hyperlinks. - t = myparser.get_hyperlinks() - mysubparser = MySubParser() + "\rOpening of", url, "did not succeed, trying next one..." 
+ i += 1 + return myparser.get_hyperlinks() + +def get_image_links(baseurl, t = []): + mysubparser = htmlparser.MySubParser() total = len(t) + progress = progressbar.Progress(total) i = 1 + for link in t: + progress.show_progress(i) + img_url = base_url + link tries = 10 while tries > 0: @@ -84,35 +52,44 @@ if __name__ == "__main__": f = urllib.urlopen(img_url) break except IOError: - tries = tries - 1 - print "Try of", img_url, "failed,", tries, "tries left" + tries -= 1 + print "\rTry of", img_url, "failed,", tries, "tries left" if not f is None: s = f.read() f.close() mysubparser.parse(s) - print "Parsed", img_url, "-", i, "of", total else: - print "Opening of", img_url, "did not succeed, trying next one..." - i = i + 1 + print "\rOpening of", img_url, "did not succeed, trying next one..." + i += 1 - t = mysubparser.get_hyperlinks() + return mysubparser.get_hyperlinks() + +def get_images(t = []): total = len(t) + progress = progressbar.Progress(total) i = 1 for link in t: + progress.show_progress(i) filename = os.path.join(savedir, os.path.split(link)[1]) if not os.path.exists(filename): tries = 10 while tries > 0: try: urllib.urlretrieve(link, filename) - print "Retrieved", link, "-", i, "of", total break except IOError: - tries = tries - 1 - print "Downloading of", link, "failed,", tries, "left" - + tries -= 1 + print "\rDownloading of", link, "failed,", tries, "left" else: - print "Not downloading", link, "already downloaded" - i = i + 1 - + print "\rNot downloading", link, "already downloaded" + i += 1 + +if __name__ == "__main__": + # Get a file-like object for the 4chan.org w/imgboard + base_url = "http://boards.4chan.org/w/" + + # Get the hyperlinks. + t = get_thread_links(base_url) + t = get_image_links(base_url, t) + get_images(t) -- cgit v1.2.3-54-g00ecf