######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
import config
import sys

def get_savedir():
    # Return the configured download directory, creating it if needed.
    conf = config.Configuration()
    savedir = conf.get_download_location()
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    return savedir

def check_archive(fullpath):
    # Return True if a file with the same basename already exists in
    # the archive directory.
    conf = config.Configuration()
    archive = conf.get_archive_location()
    filename = os.path.basename(fullpath)
    archfile = os.path.join(archive, filename)
    #sys.stderr.write("%s %d\n" % (archfile, os.path.exists(archfile)))
    return os.path.exists(archfile)

def write(message):
    sys.stdout.write(message)
    sys.stdout.flush()

class Downloader(object):
    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        self.on_downloaded = on_downloaded

    def download(self, url):
        # Open url, retrying up to 10 times on IOError.
        # Returns the open file-like object, or None if every try failed.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                write("\rTry of %s failed, %d tries left" % (url, tries))
        return f

    def get_thread_links(self, baseurl):
        # Walk the index pages (baseurl + page number) until a 404 is
        # returned and collect the thread links found on each page.
        myparser = htmlparser.MyParser()
        i = 0
        code = 0
        url = None
        while code != 404:
            url = baseurl + str(i)
            f = self.download(url)
            if f is not None:
                code = f.getcode()
                if code == 404:
                    write("\rCollected %d pages\n" % i)
                    f.close()
                    continue
                # Read the response
                s = f.read()
                f.close()
                # Process the page.
                myparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, trying next one..." \
                      % url)
            i += 1
            write("\rCollected %d pages" % i)
        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t = []):
        # Visit every thread link in t and collect the image links
        # found on those pages.
        mysubparser = htmlparser.MySubParser()
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1
        for link in t:
            progress.show_progress(i)
            img_url = baseurl + link
            f = self.download(img_url)
            if f is not None:
                s = f.read()
                f.close()
                mysubparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, " \
                      "trying next one..." % img_url)
            i += 1
        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t = []):
        # Download every image in t to the save directory, skipping
        # files that are already in the archive.
        # Returns a (skipped, failed, downloaded, total) tuple.
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1
        for link in t:
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if not check_archive(filename):
                tries = 10
                while tries > 0:
                    try:
                        urllib.urlretrieve(link, filename)
                        break
                    except IOError:
                        tries -= 1
                if tries == 0:
                    failed += 1
                else:
                    downloaded += 1
                    if self.on_downloaded is not None:
                        if not self.on_downloaded(filename):
                            failed += 1
            else:
                skipped += 1
            i += 1
        progress.complete()
        return (skipped, failed, downloaded, total)

if __name__ == "__main__":
    print "Don't run me, run 4grab.py"
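
# A minimal usage sketch, kept as a comment so nothing runs on import.
# ConsoleProgress and the example URLs below are illustrative
# assumptions, not part of 4grab; the progress_reporter passed to
# Downloader only needs to be a callable that takes a total count and
# returns an object with show_progress(i) and complete() methods, as
# the methods above use it.
#
#     class ConsoleProgress(object):
#         def __init__(self, total):
#             self.total = total
#         def show_progress(self, i):
#             write("\r%d of %d" % (i, self.total))
#         def complete(self):
#             write("\n")
#
#     downloader = Downloader(ConsoleProgress)
#     threads = downloader.get_thread_links("http://example.invalid/board/")
#     images = downloader.get_image_links("http://example.invalid", threads)
#     skipped, failed, downloaded, total = downloader.get_images(images)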