import urllib
import os

import htmlparser
import progressbar

savedir = "/home/slash/Pictures/4grab/"


def get_thread_links(baseurl):
    myparser = htmlparser.MyParser()
    # Page indices of the board to scan.
    t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
    i = 1
    total = len(t)
    progress = progressbar.Progress(total)
    for pagenum in t:
        progress.show_progress(i)
        url = baseurl + pagenum
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                print "\rTry of", url, "failed,", tries, "tries left"
        if f is not None:
            # Read the response.
            s = f.read()
            f.close()
            # Process the page.
            myparser.parse(s)
        else:
            print "\rOpening of", url, "did not succeed, trying next one..."
        i += 1
    return myparser.get_hyperlinks()


def get_image_links(baseurl, t=()):
    mysubparser = htmlparser.MySubParser()
    total = len(t)
    progress = progressbar.Progress(total)
    i = 1
    for link in t:
        progress.show_progress(i)
        img_url = baseurl + link
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries -= 1
                print "\rTry of", img_url, "failed,", tries, "tries left"
        if f is not None:
            s = f.read()
            f.close()
            mysubparser.parse(s)
        else:
            print "\rOpening of", img_url, "did not succeed, trying next one..."
        i += 1
    return mysubparser.get_hyperlinks()


def get_images(t=()):
    total = len(t)
    progress = progressbar.Progress(total)
    i = 1
    for link in t:
        progress.show_progress(i)
        # Derive the local filename from the last path component of the URL.
        filename = os.path.join(savedir, os.path.split(link)[1])
        if not os.path.exists(filename):
            tries = 10
            while tries > 0:
                try:
                    urllib.urlretrieve(link, filename)
                    break
                except IOError:
                    tries -= 1
                    print "\rDownloading of", link, "failed,", tries, "tries left"
        else:
            print "\rNot downloading", link, "- already downloaded"
        i += 1


if __name__ == "__main__":
    # Base URL of the 4chan.org /w/ imageboard.
    base_url = "http://boards.4chan.org/w/"
    # Collect thread links, extract image links from each thread,
    # then download the images.
    t = get_thread_links(base_url)
    t = get_image_links(base_url, t)
    get_images(t)
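
# Note: ``htmlparser`` and ``progressbar`` are local helper modules, not the
# standard library. The interface this script assumes them to expose, inferred
# from the calls above (the actual implementations are not shown here):
#
#   htmlparser.MyParser     -- parse(data) collects thread links from a board
#                              page; get_hyperlinks() returns them.
#   htmlparser.MySubParser  -- parse(data) collects image links from a thread
#                              page; get_hyperlinks() returns them.
#   progressbar.Progress(n) -- show_progress(i) draws progress for step i of n.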