From 025a723a5a201009da392bca4c27c4eb25e9e734 Mon Sep 17 00:00:00 2001 From: ryuslash Date: Sun, 17 Jan 2010 03:48:23 +0100 Subject: [PATCH] Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in download.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module --- .gitignore | 1 + download.py | 125 ++++++++++++++++++++----------------------------- htmlparser.py | 30 ++++++++++++ progressbar.py | 27 +++++++++++ 4 files changed, 109 insertions(+), 74 deletions(-) create mode 100644 htmlparser.py create mode 100644 progressbar.py diff --git a/.gitignore b/.gitignore index b25c15b..2f836aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *~ +*.pyc diff --git a/download.py b/download.py index 5c77654..f23d64c 100644 --- a/download.py +++ b/download.py @@ -1,82 +1,50 @@ import urllib -import sgmllib -import re import os +import htmlparser +import progressbar savedir = "/home/slash/Pictures/4grab/" - -class MyParser(sgmllib.SGMLParser): - "A simple parser class." - - def parse(self, s): - "Parse the given string 's'." - self.feed(s) - self.close() - - def __init__(self, verbose=0): - "Initialise an object, passing 'verbose' to the superclass." - - sgmllib.SGMLParser.__init__(self, verbose) - self.hyperlinks = [] - - self.url_reg = re.compile('res/\d+\Z') - self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z') - - def start_a(self, attributes): - "Process a hyperlink and its 'attributes'." - - for name, value in attributes: - if name == "href": - if self.url_reg.search(value) != None: - self.hyperlinks.append(value) - - def get_hyperlinks(self): - "Return the list of hyperlinks." 
- - return self.hyperlinks - -class MySubParser(MyParser): - def __init__(self, verbose=0): - MyParser.__init__(self, verbose) - self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z') -if __name__ == "__main__": - # Get a file-like object for the 4chan.org w/imgboard - base_url = "http://boards.4chan.org/w/" - myparser = MyParser() - total = 10 - for i in range(0, total): - if i > 0: - url = base_url + str(i) - else: - url = base_url - +def get_thread_links(baseurl): + myparser = htmlparser.MyParser() + t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] + i = 1 + total = len(t) + progress = progressbar.Progress(total) + + for pagenum in t: + progress.show_progress(i) + + url = base_url + pagenum tries = 10 while tries > 0: - try: + try: f = urllib.urlopen(url) break except IOError: - tries = tries - 1 - print "Try of", url, "failed,", tries, "tries left" + tries -= 1 + print "\rTry of", url, "failed,", tries, "tries left" if not f is None: - # Read the object + # Read the response s = f.read() f.close() - - # Try and process the page. - # The class should have been defined first, remember. + + # Process the page. myparser.parse(s) - print "Parsed", url, "-", i + 1, "of", total else: - "Opening of", url, "did not succeed, trying next one..." - - # Get the hyperlinks. - t = myparser.get_hyperlinks() - mysubparser = MySubParser() + "\rOpening of", url, "did not succeed, trying next one..." 
+ i += 1 + return myparser.get_hyperlinks() + +def get_image_links(baseurl, t = []): + mysubparser = htmlparser.MySubParser() total = len(t) + progress = progressbar.Progress(total) i = 1 + for link in t: + progress.show_progress(i) + img_url = base_url + link tries = 10 while tries > 0: @@ -84,35 +52,44 @@ if __name__ == "__main__": f = urllib.urlopen(img_url) break except IOError: - tries = tries - 1 - print "Try of", img_url, "failed,", tries, "tries left" + tries -= 1 + print "\rTry of", img_url, "failed,", tries, "tries left" if not f is None: s = f.read() f.close() mysubparser.parse(s) - print "Parsed", img_url, "-", i, "of", total else: - print "Opening of", img_url, "did not succeed, trying next one..." - i = i + 1 + print "\rOpening of", img_url, "did not succeed, trying next one..." + i += 1 - t = mysubparser.get_hyperlinks() + return mysubparser.get_hyperlinks() + +def get_images(t = []): total = len(t) + progress = progressbar.Progress(total) i = 1 for link in t: + progress.show_progress(i) filename = os.path.join(savedir, os.path.split(link)[1]) if not os.path.exists(filename): tries = 10 while tries > 0: try: urllib.urlretrieve(link, filename) - print "Retrieved", link, "-", i, "of", total break except IOError: - tries = tries - 1 - print "Downloading of", link, "failed,", tries, "left" - + tries -= 1 + print "\rDownloading of", link, "failed,", tries, "left" else: - print "Not downloading", link, "already downloaded" - i = i + 1 - + print "\rNot downloading", link, "already downloaded" + i += 1 + +if __name__ == "__main__": + # Get a file-like object for the 4chan.org w/imgboard + base_url = "http://boards.4chan.org/w/" + + # Get the hyperlinks. 
+ t = get_thread_links(base_url) + t = get_image_links(base_url, t) + get_images(t) diff --git a/htmlparser.py b/htmlparser.py new file mode 100644 index 0000000..73338dd --- /dev/null +++ b/htmlparser.py @@ -0,0 +1,30 @@ +import sgmllib +import re + +class MyParser(sgmllib.SGMLParser): + def __init__(self, verbose=0): + sgmllib.SGMLParser.__init__(self, verbose) + + self.hyperlinks = [] + self.url_reg = re.compile('res/\d+\Z') + self.prev = "" + + def parse(self, s): + self.feed(s) + self.close() + + def start_a(self, attributes): + for name, value in attributes: + if name == "href": + if self.url_reg.search(value) != None: + if self.prev != value: + self.hyperlinks.append(value) + self.prev = value + + def get_hyperlinks(self): + return self.hyperlinks + +class MySubParser(MyParser): + def __init__(self, verbose=0): + MyParser.__init__(self, verbose) + self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z') diff --git a/progressbar.py b/progressbar.py new file mode 100644 index 0000000..a2ea711 --- /dev/null +++ b/progressbar.py @@ -0,0 +1,27 @@ +import sys +import time + +class Progress(): + def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout): + self.maxwidth = maxwidth + self.maxvalue = maxvalue + self.fd = fd + self.fill_char = '#' + + self.show_progress(0) + + def show_progress(self, value): + str_value = str(value) + str_maxvalue = str(self.maxvalue) + true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue) + progress = int(round((true_maxwidth/float(self.maxvalue))*value)) + self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress))) + self.fd.flush() + if value == self.maxvalue: + self.fd.write("\n") + +if __name__ == "__main__": + prog = Progress(200) + for i in range(1, 201): + prog.show_progress(i) + time.sleep(1)