diff --git a/4grab.py b/4grab.py index ae550b4..7240f16 100755 --- a/4grab.py +++ b/4grab.py @@ -24,9 +24,11 @@ import sys import config import download +import progressbar base_url = "http://boards.4chan.org/" parser = optparse.OptionParser() +downloader = download.Downloader(progressbar.Progress) def walk_with_wizard(baseurl): print "Alright, let me put on my robe and wizard hat." @@ -42,19 +44,19 @@ def walk_with_wizard(baseurl): if inp == "single": inp = raw_input("Which thread would you like to download? ") if inp[:7] == "http://": - t = download.get_image_links("", [inp]) + t = downloader.get_image_links("", [inp]) else: thread = inp inp = raw_input("Which category is this thread in? ") - t = download.get_image_links("%s%s/res/" % (baseurl, inp), [thread]) + t = downloader.get_image_links("%s%s/res/" % (baseurl, inp), [thread]) else: inp = raw_input("Which category would you like to download? ") config.Configuration().set_category(inp) baseurl = "%s%s/" % (baseurl, config.Configuration().get_category()) - t = download.get_thread_links(baseurl) - t = download.get_image_links(baseurl, t) - (skipped, failed, downloaded, total) = download.get_images(t) + t = downloader.get_thread_links(baseurl) + t = downloader.get_image_links(baseurl, t) + (skipped, failed, downloaded, total) = downloader.get_images(t) print "Downloaded: ", downloaded print "Skipped: ", skipped print "Failed: ", failed @@ -67,10 +69,26 @@ parser.set_usage( This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under certain conditions.""") -parser.add_option("-e", nargs=2, dest="confval", metavar="CONF VALUE", help="Set configuration option CONF to be VALUE") -parser.add_option("-c", "--category", dest="tempcat", metavar="CATEGORY", help="Set the category to CATEGORY only for this run") -parser.add_option("-t", "--thread", dest="thread", metavar="THREAD", help="Download only THREAD. If THREAD is only an ID, CATEGORY must also be set. 
Otherwise, no problem :-)") -parser.add_option("-w", "--wizard", action="store_true", dest="wizard", help="I'll put on my robe and wizard hat and help you get some of those pictures you like") +parser.add_option("-e", + nargs=2, + dest="confval", + metavar="CONF VALUE", + help="Set configuration option CONF to be VALUE") +parser.add_option("-c", + "--category", + dest="tempcat", + metavar="CATEGORY", + help="Set the category to CATEGORY only for this run") +parser.add_option("-t", + "--thread", + dest="thread", + metavar="THREAD", + help="Download only THREAD. If THREAD is only an ID, CATEGORY must also be set. Otherwise, no problem :-)") +parser.add_option("-w", + "--wizard", + action="store_true", + dest="wizard", + help="I'll put on my robe and wizard hat and help you get some of those pictures you like") (options, args) = parser.parse_args() @@ -94,14 +112,14 @@ elif options.wizard: exit(0) elif options.thread: if options.thread[:7] == "http://": - t = download.get_image_links("", [options.thread]) + t = downloader.get_image_links("", [options.thread]) elif options.tempcat: url = "%s%s/res/" % (base_url, options.tempcat) - t = download.get_image_links(url, [options.thread]) + t = downloader.get_image_links(url, [options.thread]) else: print "if THREAD is not an absolute URL, CATEGORY must also be specified" exit(1) - (skipped, failed, downloaded, total) = download.get_images(t) + (skipped, failed, downloaded, total) = downloader.get_images(t) print "Downloaded: ", downloaded print "Skipped: ", skipped print "Failed: ", failed @@ -112,9 +130,9 @@ elif options.tempcat: base_url = "%s%s/" % (base_url, config.Configuration().get_category()) -t = download.get_thread_links(base_url) -t = download.get_image_links(base_url, t) -(skipped, failed, downloaded, total) = download.get_images(t) +t = downloader.get_thread_links(base_url) +t = downloader.get_image_links(base_url, t) +(skipped, failed, downloaded, total) = downloader.get_images(t) print "Downloaded: ", 
downloaded print "Skipped: ", skipped print "Failed: ", failed diff --git a/download.py b/download.py index b9f9f72..2405805 100644 --- a/download.py +++ b/download.py @@ -22,104 +22,110 @@ import urllib import os import htmlparser -import progressbar +#import progressbar import config savedir = config.Configuration().get_download_location() if not os.path.exists(savedir): os.makedirs(savedir) -def get_thread_links(baseurl): - myparser = htmlparser.MyParser() - t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] - i = 1 - total = len(t) - progress = progressbar.Progress(total) +class Downloader(object): + def __init__(self, progress_reporter): + self.progress_reporter = progress_reporter - for pagenum in t: - progress.show_progress(i) - - url = baseurl + pagenum - tries = 10 - while tries > 0: - try: - f = urllib.urlopen(url) - break - except IOError: - tries -= 1 - print "\rTry of", url, "failed,", tries, "tries left" - if not f is None: - # Read the response - s = f.read() - f.close() - - # Process the page. - myparser.parse(s) - else: - "\rOpening of", url, "did not succeed, trying next one..." - i += 1 - - progress.complete() - return myparser.get_hyperlinks() - -def get_image_links(baseurl, t = []): - mysubparser = htmlparser.MySubParser() - total = len(t) - progress = progressbar.Progress(total) - i = 1 - - for link in t: - progress.show_progress(i) - - img_url = baseurl + link - tries = 10 - while tries > 0: - try: - f = urllib.urlopen(img_url) - break - except IOError: - tries -= 1 - print "\rTry of", img_url, "failed,", tries, "tries left" - if not f is None: - s = f.read() - f.close() - - mysubparser.parse(s) - else: - print "\rOpening of", img_url, "did not succeed, trying next one..." 
- i += 1 - - progress.complete() - return mysubparser.get_hyperlinks() - -def get_images(t = []): - skipped = 0 - failed = 0 - downloaded = 0 - total = len(t) - progress = progressbar.Progress(total) - i = 1 - for link in t: - progress.show_progress(i) - filename = os.path.join(savedir, os.path.split(link)[1]) - if not os.path.exists(filename): + def get_thread_links(self, baseurl): + myparser = htmlparser.MyParser() + t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] + i = 1 + total = len(t) + progress = self.progress_reporter(total) + + for pagenum in t: + progress.show_progress(i) + + url = baseurl + pagenum + f = None tries = 10 while tries > 0: try: - urllib.urlretrieve(link, filename) + f = urllib.urlopen(url) break except IOError: tries -= 1 - if tries == 0: - failed += 1 + print "\rTry of", url, "failed,", tries, "tries left" + if f is not None: + # Read the response + s = f.read() + f.close() + + # Process the page. + myparser.parse(s) else: - downloaded += 1 - else: - skipped += 1 - i += 1 + print "\rOpening of", url, "did not succeed, trying next one..." + i += 1 - progress.complete() - return (skipped, failed, downloaded, total) + progress.complete() + return myparser.get_hyperlinks() + + def get_image_links(self, baseurl, t = []): + mysubparser = htmlparser.MySubParser() + total = len(t) + progress = self.progress_reporter(total) + i = 1 + + for link in t: + progress.show_progress(i) + + img_url = baseurl + link + f = None + tries = 10 + while tries > 0: + try: + f = urllib.urlopen(img_url) + break + except IOError: + tries -= 1 + print "\rTry of", img_url, "failed,", tries, "tries left" + if f is not None: + s = f.read() + f.close() + + mysubparser.parse(s) + else: + print "\rOpening of", img_url, "did not succeed, trying next one..." 
+ i += 1 + + progress.complete() + return mysubparser.get_hyperlinks() + + def get_images(self, t = []): + skipped = 0 + failed = 0 + downloaded = 0 + total = len(t) + progress = self.progress_reporter(total) + i = 1 + for link in t: + progress.show_progress(i) + filename = os.path.join(savedir, os.path.split(link)[1]) + if not os.path.exists(filename): + tries = 10 + while tries > 0: + try: + urllib.urlretrieve(link, filename) + break + except IOError: + tries -= 1 + if tries == 0: + failed += 1 + else: + downloaded += 1 + else: + skipped += 1 + i += 1 + + progress.complete() + return (skipped, failed, downloaded, total) if __name__ == "__main__": # Get a file-like object for the 4chan.org w/imgboard