#!/usr/bin/env python
######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import os
import urllib

import config
import htmlparser


def get_savedir():
    """Return the configured download directory, creating it if needed."""
    conf = config.Configuration()
    savedir = conf.get_download_location()
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    return savedir


def check_archive(fullpath):
    """Return True if a file with this name is already in the archive."""
    conf = config.Configuration()
    archive = conf.get_archive_location()
    filename = os.path.basename(fullpath)
    archfile = os.path.join(archive, filename)
    return os.path.exists(archfile)


class Downloader(object):
    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register a callback to run after each successful download."""
        self.on_downloaded = on_downloaded

    def get_thread_links(self, baseurl):
        """Collect thread links from the board's 16 index pages."""
        myparser = htmlparser.MyParser()
        pages = [str(n) for n in range(16)]
        total = len(pages)
        progress = self.progress_reporter(total)
        i = 1

        for pagenum in pages:
            progress.show_progress(i)
            url = baseurl + pagenum
            f = None  # Stays None if every attempt fails.
            tries = 10
            while tries > 0:
                try:
                    f = urllib.urlopen(url)
                    break
                except IOError:
                    tries -= 1
                    print "\rTry of", url, "failed,", tries, "tries left"
            if f is not None:
                # Read the response and feed it to the parser.
                s = f.read()
                f.close()
                myparser.parse(s)
            else:
                print "\rOpening of", url, "did not succeed, trying next one..."
            i += 1

        progress.complete()
        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Collect image links from each thread page in t."""
        if t is None:
            t = []
        mysubparser = htmlparser.MySubParser()
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1

        for link in t:
            progress.show_progress(i)
            img_url = baseurl + link
            f = None  # Stays None if every attempt fails.
            tries = 10
            while tries > 0:
                try:
                    f = urllib.urlopen(img_url)
                    break
                except IOError:
                    tries -= 1
                    print "\rTry of", img_url, "failed,", tries, "tries left"
            if f is not None:
                s = f.read()
                f.close()
                mysubparser.parse(s)
            else:
                print "\rOpening of", img_url, "did not succeed, trying next one..."
            i += 1

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every image in t, skipping those already archived.

        Returns a (skipped, failed, downloaded, total) tuple.
        """
        if t is None:
            t = []
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1

        for link in t:
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if not check_archive(filename):
                tries = 10
                while tries > 0:
                    try:
                        urllib.urlretrieve(link, filename)
                        break
                    except IOError:
                        tries -= 1
                if tries == 0:
                    failed += 1
                else:
                    downloaded += 1
                    # A post-download callback that returns a false value
                    # counts as a failure as well.
                    if self.on_downloaded is not None:
                        if not self.on_downloaded(filename):
                            failed += 1
            else:
                skipped += 1
            i += 1

        progress.complete()
        return (skipped, failed, downloaded, total)


if __name__ == "__main__":
    print "Don't run me, run 4grab.py"
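
# ---------------------------------------------------------------------
# Usage sketch (illustrative only; the real entry point is 4grab.py).
# Downloader accepts any callable that, given a total count, returns an
# object with show_progress(i) and complete() methods.  ConsoleProgress
# and the example URLs below are hypothetical, not part of 4grab:
#
#     class ConsoleProgress(object):
#         def __init__(self, total):
#             self.total = total
#         def show_progress(self, i):
#             print "\r%d of %d" % (i, self.total),
#         def complete(self):
#             print "\rdone"
#
#     d = Downloader(ConsoleProgress)
#     threads = d.get_thread_links("http://example.com/board/")
#     images = d.get_image_links("http://example.com", threads)
#     skipped, failed, downloaded, total = d.get_images(images)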