4grab/download.py

######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
import config
import sys

def get_savedir():
    """Return the configured download directory, creating it if missing.

    The location is read from a fresh config.Configuration() on every
    call.
    """
    target = config.Configuration().get_download_location()
    if os.path.exists(target):
        return target
    # First use of this location: build the whole directory chain.
    os.makedirs(target)
    return target
def check_archive(fullpath):
    """Tell whether a file with fullpath's base name exists in the archive.

    Only the final path component of fullpath is used; it is looked up
    inside the archive location taken from config.Configuration().
    """
    archive_dir = config.Configuration().get_archive_location()
    candidate = os.path.join(archive_dir, os.path.basename(fullpath))
    return os.path.exists(candidate)
def write(message):
    """Write message to standard output and flush it right away.

    Flushing lets "\\r"-style progress lines show up immediately even
    though they do not end with a newline.
    """
    out = sys.stdout
    out.write(message)
    out.flush()

class Downloader(object):
    """Fetch index pages, thread pages and images while reporting progress.

    progress_reporter is called with the number of items about to be
    processed and must return an object offering show_progress(i) and
    complete().
    """

    # Attempts made for a single URL before it is given up on.
    max_tries = 10
    # Consecutive unreachable index pages tolerated by get_thread_links
    # before it stops; without a bound an unreachable server would keep
    # the page loop spinning forever, since no 404 is ever seen.
    max_failed_pages = 5

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        # Optional callback called with the path of each freshly
        # downloaded file; a false return value marks that file as failed.
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register the callback run after each successful download."""
        self.on_downloaded = on_downloaded

    def download(self, url):
        """Open url, retrying when an IOError occurs.

        Returns the file-like response object, or None when every one of
        the max_tries attempts failed.
        """
        for remaining in range(self.max_tries - 1, -1, -1):
            try:
                return urllib.urlopen(url)
            except IOError:
                write("\rTry of %s failed, %d tries left" % (url, remaining))
        return None

    def get_thread_links(self, baseurl):
        """Collect links from the numbered pages below baseurl.

        Pages baseurl + "0", baseurl + "1", ... are fetched and fed to
        the parser until a page answers with HTTP status 404, or until
        max_failed_pages pages in a row could not be opened at all.

        Returns the hyperlinks gathered by the parser.
        """
        myparser = htmlparser.MyParser()
        page = 0
        unreachable = 0

        while True:
            url = baseurl + str(page)
            f = self.download(url)

            if f is not None:
                unreachable = 0
                # Close the response even if reading it raises.
                try:
                    if f.getcode() == 404:
                        write("\rCollected %d pages\n" % page)
                        break
                    s = f.read()
                finally:
                    f.close()
                myparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, trying next one..."
                      % url)
                unreachable += 1
                if unreachable >= self.max_failed_pages:
                    write("\rGiving up after %d consecutive unreachable "
                          "pages\n" % unreachable)
                    break
            page += 1
            write("\rCollected %d pages" % page)

        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Collect links from every page named in t.

        Each entry of t is appended to baseurl, downloaded and fed to
        the sub parser; pages that cannot be opened are skipped.

        Returns the hyperlinks gathered by the sub parser.
        """
        links = [] if t is None else t
        mysubparser = htmlparser.MySubParser()
        progress = self.progress_reporter(len(links))

        for i, link in enumerate(links, 1):
            progress.show_progress(i)

            img_url = baseurl + link
            f = self.download(img_url)
            if f is None:
                write("\rOpening of %s did not succeed, "
                      "trying next one..." % img_url)
                continue

            # Close the response even if reading it raises.
            try:
                s = f.read()
            finally:
                f.close()
            mysubparser.parse(s)

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every URL in t into the save directory.

        Files whose name already exists in the archive are skipped.  A
        file counts as failed when it cannot be retrieved or when the
        on_downloaded callback returns a false value for it; each link
        ends up in exactly one of the three tallies.

        Returns the tuple (skipped, failed, downloaded, total).
        """
        links = [] if t is None else t
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(links)
        progress = self.progress_reporter(total)

        for i, link in enumerate(links, 1):
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])

            if check_archive(filename):
                skipped += 1
            elif not self._retrieve(link, filename):
                failed += 1
            elif (self.on_downloaded is not None
                  and not self.on_downloaded(filename)):
                # Retrieved, but rejected by the callback: failed only,
                # not downloaded as well.
                failed += 1
            else:
                downloaded += 1

        progress.complete()
        return (skipped, failed, downloaded, total)

    def _retrieve(self, link, filename):
        """Save link to filename, retrying on IOError; True on success."""
        for _ in range(self.max_tries):
            try:
                urllib.urlretrieve(link, filename)
                return True
            except IOError:
                # Transient failure: fall through to the next attempt.
                pass
        return False

if __name__ == "__main__":
    # This module is only meant to be imported; point at the real script.
    print("Don't run me, run 4grab.py")
Added license info and README 2010-02-09 02:45:56 +01:00			`######################################################################`
			`# Copyright 2009, 2010 ryuslash`
			`#`
			`# This file is part of 4grab.`
			`#`
			`# 4grab is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# 4grab is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with 4grab. If not, see <http://www.gnu.org/licenses/>.`
			`######################################################################`

Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`import urllib`
			`import os`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`import htmlparser`
Started working on configuration file 2010-01-18 20:08:38 +01:00			`import config`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`import sys`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`def get_savedir():`
			`conf = config.Configuration()`
			`savedir = conf.get_download_location()`
			`if not os.path.exists(savedir):`
			`os.makedirs(savedir)`
			`return savedir`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`def check_archive(fullpath):`
			`conf = config.Configuration()`
			`archive = conf.get_archive_location()`
			`filename = os.path.basename(fullpath)`
			`archfile = os.path.join(archive, filename)`
Error with archive The archive function in sorter.py didn't archive to .arch, now it does 2010-04-07 23:30:59 +02:00			`#sys.stderr.write("%s %d\n" % (archfile, os.path.exists(archfile)))`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`return os.path.exists(archfile)`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`def write(message):`
			`sys.stdout.write(message)`
			`sys.stdout.flush()`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`class Downloader(object):`
			`def __init__(self, progress_reporter):`
			`self.progress_reporter = progress_reporter`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`self.on_downloaded = None`

			`def set_on_downloaded(self, on_downloaded):`
			`self.on_downloaded = on_downloaded`
Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Seperate download function download function has been seperated from get_thread_links and get_image_links 2010-03-30 15:59:15 +02:00			`def download(self, url):`
			`f = None`
			`tries = 10`
			`while tries > 0:`
			`try:`
			`f = urllib.urlopen(url)`
			`break`
			`except IOError:`
			`tries -= 1`
			`write("\rTry of %s failed, %d tries left" % (url, tries))`
			`return f`

Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`def get_thread_links(self, baseurl):`
			`myparser = htmlparser.MyParser()`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`i = 0`
			`code = 0`
			`url = None`

			`while code != 404:`
			`url = baseurl + str(i)`
Seperate download function download function has been seperated from get_thread_links and get_image_links 2010-03-30 15:59:15 +02:00			`f = self.download(url)`

Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`if not f is None:`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`code = f.getcode()`
			`if code == 404:`
			`write("\rCollected %d pages\n" % i)`
			`f.close()`
			`continue`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`# Read the response`
			`s = f.read()`
			`f.close()`

			`# Process the page.`
			`myparser.parse(s)`
			`else:`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`write("\rOpening of %s did not succeed, trying next one..." \`
			`% url)`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`i += 1`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`write("\rCollected %d pages" % i)`
Added --thread With --thread a single thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`return myparser.get_hyperlinks()`
Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`def get_image_links(self, baseurl, t = []):`
			`mysubparser = htmlparser.MySubParser()`
			`total = len(t)`
			`progress = self.progress_reporter(total)`
			`i = 1`
Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`for link in t:`
			`progress.show_progress(i)`
Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`img_url = baseurl + link`
Seperate download function download function has been seperated from get_thread_links and get_image_links 2010-03-30 15:59:15 +02:00			`f = self.download(img_url)`

Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`if not f is None:`
			`s = f.read()`
			`f.close()`

			`mysubparser.parse(s)`
			`else:`
non-fixed page count 4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error 2010-03-25 22:28:08 +01:00			`write("\rOpening of %s did not succeed, " \`
			`"trying next one..." % img_url)`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`i += 1`

			`progress.complete()`
			`return mysubparser.get_hyperlinks()`

			`def get_images(self, t = []):`
			`skipped = 0`
			`failed = 0`
			`downloaded = 0`
			`total = len(t)`
			`progress = self.progress_reporter(total)`
			`i = 1`
			`for link in t:`
			`progress.show_progress(i)`
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`filename = os.path.join(get_savedir(), os.path.split(link)[1])`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`if not check_archive(filename):`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`tries = 10`
			`while tries > 0:`
			`try:`
			`urllib.urlretrieve(link, filename)`
			`break`
			`except IOError:`
			`tries -= 1`
			`if tries == 0:`
			`failed += 1`
			`else:`
			`downloaded += 1`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`if self.on_downloaded is not None:`
FAIL and print If source and dest in copy are the same, it is no longer reported If an image can't be read, it is counted as failed 2010-03-19 16:08:39 +01:00			`if not self.on_downloaded(filename):`
			`failed += 1`
Stopped saying Failed and Skipped things Now it summarizes at the end 2010-02-12 00:04:34 +01:00			`else:`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`skipped += 1`
			`i += 1`
Added --thread With --thread a single thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`progress.complete()`
			`return (skipped, failed, downloaded, total)`
Parser separation, progress bar * Separated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
			`if __name__ == "__main__":`
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`print "Don't run me, run 4grab.py"`