4grab/download.py

#!/usr/bin/env python

######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
import config
import sys

def get_savedir():
    """Return the configured download directory, creating it if missing.

    The location is read from a fresh ``config.Configuration`` on every
    call; when the path does not exist yet it is created (including any
    intermediate directories) before being returned.
    """
    location = config.Configuration().get_download_location()
    if os.path.exists(location):
        return location
    os.makedirs(location)
    return location
def check_archive(fullpath):
    """Return True when the archive directory holds a file named like *fullpath*.

    Only the base name of *fullpath* is used; it is looked up directly
    inside the archive location reported by ``config.Configuration``.
    """
    archive_dir = config.Configuration().get_archive_location()
    archived_copy = os.path.join(archive_dir, os.path.basename(fullpath))
    return os.path.exists(archived_copy)
def write(message):
    """Write *message* to standard output and flush it immediately.

    No newline is appended, so callers can redraw a status line by
    starting their message with a carriage return.
    """
    out = sys.stdout
    out.write(message)
    out.flush()

class Downloader(object):
    """Collect thread and image links from index pages and download images.

    ``progress_reporter`` is a callable that takes the total number of
    items and returns an object offering ``show_progress(i)`` and
    ``complete()``; it is invoked once per batch in ``get_image_links``
    and ``get_images``.
    """

    # Attempts made on a single URL before it is given up on.
    MAX_TRIES = 10
    # Consecutive index pages that may be unreachable before
    # get_thread_links stops; without a bound an unreachable site would
    # never produce the 404 that normally ends the scan.
    MAX_FAILED_PAGES = 3

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        # Optional callback run after each successful download.
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register a callback invoked with the saved file path after each
        successful download; a falsy return value counts as a failure."""
        self.on_downloaded = on_downloaded

    def _open_url(self, url):
        """Open *url*, retrying on IOError up to MAX_TRIES times.

        Returns the response object, or None when every attempt failed.
        """
        tries = self.MAX_TRIES
        while tries > 0:
            try:
                return urllib.urlopen(url)
            except IOError:
                tries -= 1
                write("\rTry of %s failed, %d tries left" % (url, tries))
        return None

    def _retrieve(self, link, filename):
        """Download *link* to *filename*, retrying on IOError up to
        MAX_TRIES times. Returns True on success, False otherwise."""
        for _ in range(self.MAX_TRIES):
            try:
                urllib.urlretrieve(link, filename)
                return True
            except IOError:
                # Transient failure: fall through to the next attempt.
                continue
        return False

    def get_thread_links(self, baseurl):
        """Walk the numbered index pages ``baseurl + "0"``, ``"1"``, ... and
        return the hyperlinks gathered by ``htmlparser.MyParser``.

        The scan stops at the first page answering 404, or after
        MAX_FAILED_PAGES consecutive pages could not be opened at all.
        """
        myparser = htmlparser.MyParser()
        i = 0
        failed_pages = 0

        while True:
            url = baseurl + str(i)
            f = self._open_url(url)
            if f is not None:
                failed_pages = 0
                try:
                    if f.getcode() == 404:
                        write("\rCollected %d pages\n" % i)
                        break
                    # Read the response
                    s = f.read()
                finally:
                    # Close the response even when reading fails or we stop.
                    f.close()

                # Process the page.
                myparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, trying next one..."
                      % url)
                failed_pages += 1
                if failed_pages >= self.MAX_FAILED_PAGES:
                    write("\rGiving up after %d unreachable pages in a row\n"
                          % failed_pages)
                    break
            i += 1
            write("\rCollected %d pages" % i)

        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Fetch every page ``baseurl + link`` for the links in *t* and
        return the hyperlinks gathered by ``htmlparser.MySubParser``.

        Pages that cannot be opened are reported and skipped.
        """
        if t is None:
            t = []
        mysubparser = htmlparser.MySubParser()
        progress = self.progress_reporter(len(t))

        for i, link in enumerate(t, 1):
            progress.show_progress(i)

            img_url = baseurl + link
            f = self._open_url(img_url)
            if f is None:
                write("\rOpening of %s did not succeed, "
                      "trying next one..." % img_url)
                continue
            try:
                s = f.read()
            finally:
                f.close()

            mysubparser.parse(s)

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every URL in *t* into the download directory.

        Files whose name already exists in the archive are skipped.
        Returns a tuple ``(skipped, failed, downloaded, total)``. A file
        that was downloaded but rejected by the ``on_downloaded``
        callback is counted in both ``downloaded`` and ``failed``.
        """
        if t is None:
            t = []
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(t)
        progress = self.progress_reporter(total)

        for i, link in enumerate(t, 1):
            progress.show_progress(i)
            # Resolved per item so the directory is only created when
            # there is actually something to download.
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if check_archive(filename):
                skipped += 1
                continue
            if not self._retrieve(link, filename):
                failed += 1
                continue
            downloaded += 1
            if self.on_downloaded is not None:
                if not self.on_downloaded(filename):
                    failed += 1

        progress.complete()
        return (skipped, failed, downloaded, total)

# This module is a library for 4grab.py; running it directly only
# prints a hint pointing at the real entry point.
if __name__ == "__main__":
    print("Don't run me, run 4grab.py")