#!/usr/bin/env python

######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
#import progressbar
import config

# Directory the images are written to; created on import if it is missing.
savedir = config.Configuration().get_download_location()
if not os.path.exists(savedir):
    os.makedirs(savedir)

# Number of attempts made for every URL before it is given up on.
MAX_TRIES = 10


def _open_with_retries(url, tries=MAX_TRIES):
    """Open *url* with urllib.urlopen, retrying on IOError.

    Returns the opened file-like object, or None when every attempt
    failed.  Each failed attempt is reported on stdout.
    """
    while tries > 0:
        try:
            return urllib.urlopen(url)
        except IOError:
            tries -= 1
            # A single formatted string prints the same text as the
            # comma separated print statement used previously.
            print("\rTry of %s failed, %s tries left" % (url, tries))
    return None


def _retrieve_with_retries(url, filename, tries=MAX_TRIES):
    """Download *url* to *filename*, retrying on IOError.

    Returns True when urllib.urlretrieve succeeded within *tries*
    attempts and False otherwise.
    """
    while tries > 0:
        try:
            urllib.urlretrieve(url, filename)
            return True
        except IOError:
            tries -= 1
    return False


class _SilentProgress(object):
    """Progress reporter that shows nothing.

    Offers the three calls Downloader makes on a reporter: construction
    with the total item count, show_progress(current) and complete().
    Used when this module is run as a script.
    """

    def __init__(self, total):
        self.total = total

    def show_progress(self, current):
        pass

    def complete(self):
        pass


class Downloader(object):
    """Collects links from a board and downloads what they point to.

    progress_reporter is called with the number of items about to be
    processed and must return an object providing show_progress(index)
    and complete().
    """

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter

    def _parse_pages(self, parser, urls):
        """Fetch every URL in *urls*, feed each body to parser.parse().

        URLs that cannot be opened are reported and skipped.  Returns
        parser.get_hyperlinks() once all URLs have been handled.
        """
        progress = self.progress_reporter(len(urls))

        for index, url in enumerate(urls):
            progress.show_progress(index + 1)

            f = _open_with_retries(url)
            if f is None:
                print("\rOpening of %s did not succeed, trying next one..."
                      % (url,))
                continue

            # Read the response; close the handle even if reading fails.
            try:
                s = f.read()
            finally:
                f.close()

            # Process the page.
            parser.parse(s)

        progress.complete()
        return parser.get_hyperlinks()

    def get_thread_links(self, baseurl):
        """Parse pages "0" through "10" below *baseurl*.

        Returns the hyperlinks collected by htmlparser.MyParser.
        """
        pages = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
        urls = [baseurl + pagenum for pagenum in pages]
        return self._parse_pages(htmlparser.MyParser(), urls)

    def get_image_links(self, baseurl, t=None):
        """Parse the page at baseurl + link for every link in *t*.

        Returns the hyperlinks collected by htmlparser.MySubParser.
        *t* defaults to no links at all.
        """
        links = [] if t is None else t
        urls = [baseurl + link for link in links]
        return self._parse_pages(htmlparser.MySubParser(), urls)

    def get_images(self, t=None):
        """Download every link in *t* into savedir.

        A link whose file name (the last path component of the link)
        already exists in savedir is skipped.  Returns the tuple
        (skipped, failed, downloaded, total).  *t* defaults to no links.
        """
        links = [] if t is None else t
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(links)
        progress = self.progress_reporter(total)

        for index, link in enumerate(links):
            progress.show_progress(index + 1)
            filename = os.path.join(savedir, os.path.split(link)[1])

            if os.path.exists(filename):
                skipped += 1
            elif _retrieve_with_retries(link, filename):
                downloaded += 1
            else:
                failed += 1

        progress.complete()
        return (skipped, failed, downloaded, total)


if __name__ == "__main__":
    # Build the base URL of the configured 4chan.org board.
    base_url = ("http://boards.4chan.org/"
                + config.Configuration().get_category() + "/")

    # The link and download functions are Downloader methods, so an
    # instance (with a reporter) is needed to call them.
    downloader = Downloader(_SilentProgress)

    # Get the hyperlinks.
    t = downloader.get_thread_links(base_url)
    t = downloader.get_image_links(base_url, t)
    downloader.get_images(t)