4grab/download.py

#!/usr/bin/env python

######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
import config

def get_savedir():
    """Return the download location from the configuration.

    The directory (including missing parents) is created first when it
    does not exist yet.
    """
    download_dir = config.Configuration().get_download_location()
    if os.path.exists(download_dir):
        return download_dir
    os.makedirs(download_dir)
    return download_dir
def check_archive(fullpath):
    """Tell whether the archive location already holds this file.

    Only the base name of fullpath is used; it is looked up directly
    inside the archive location taken from the configuration.
    """
    archive_dir = config.Configuration().get_archive_location()
    candidate = os.path.join(archive_dir, os.path.basename(fullpath))
    return os.path.exists(candidate)

class Downloader(object):
    """Collect links from remote pages and download the images they name.

    progress_reporter is called with the number of work items at the
    start of every operation; the object it returns must provide
    show_progress(i) (called with a 1-based item index) and complete().
    """

    # Number of attempts made for a URL before it is given up on.
    MAX_TRIES = 10
    # get_thread_links() scans the pages baseurl + "0" .. baseurl + "15".
    PAGE_COUNT = 16

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        # Optional callable, invoked with the local file name of every
        # image that was downloaded successfully.
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register the callable run after each successful image download."""
        self.on_downloaded = on_downloaded

    def _fetch(self, url):
        """Return the body of url, or None if it could not be opened.

        Opening is attempted MAX_TRIES times and every failed attempt is
        reported.  Returning None explicitly (rather than testing a
        variable that may never have been assigned, or that still holds
        the previous page's closed response) is what lets callers skip
        the page safely.
        """
        for tries_left in reversed(range(self.MAX_TRIES)):
            try:
                response = urllib.urlopen(url)
            except IOError:
                print("\rTry of %s failed, %s tries left" % (url, tries_left))
                continue
            try:
                return response.read()
            finally:
                # Close the response even when reading it fails.
                response.close()
        print("\rOpening of %s did not succeed, trying next one..." % (url,))
        return None

    def _retrieve(self, url, filename):
        """Download url to filename; return False once every attempt failed."""
        for _ in range(self.MAX_TRIES):
            try:
                urllib.urlretrieve(url, filename)
            except IOError:
                continue
            return True
        return False

    def _collect_links(self, parser, urls):
        """Feed every reachable page in urls to parser, return its links."""
        progress = self.progress_reporter(len(urls))
        for index, url in enumerate(urls, 1):
            progress.show_progress(index)
            page = self._fetch(url)
            if page is not None:
                parser.parse(page)
        progress.complete()
        return parser.get_hyperlinks()

    def get_thread_links(self, baseurl):
        """Parse the numbered pages below baseurl with htmlparser.MyParser.

        Pages that cannot be opened are skipped.  Returns whatever the
        parser's get_hyperlinks() reports after all pages were processed.
        """
        urls = [baseurl + str(page) for page in range(self.PAGE_COUNT)]
        return self._collect_links(htmlparser.MyParser(), urls)

    def get_image_links(self, baseurl, t=None):
        """Parse baseurl + link for each link in t with htmlparser.MySubParser.

        Pages that cannot be opened are skipped.  Returns whatever the
        parser's get_hyperlinks() reports after all pages were processed.
        """
        links = [] if t is None else t
        urls = [baseurl + link for link in links]
        return self._collect_links(htmlparser.MySubParser(), urls)

    def get_images(self, t=None):
        """Download every URL in t into the save directory.

        A link is skipped when check_archive() reports its file as
        archived.  After a successful download the on_downloaded
        callback, if set, is called with the local file name.

        Returns the tuple (skipped, failed, downloaded, total).
        """
        links = [] if t is None else t
        skipped = failed = downloaded = 0
        total = len(links)
        progress = self.progress_reporter(total)
        for index, link in enumerate(links, 1):
            progress.show_progress(index)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if check_archive(filename):
                skipped += 1
                continue
            if self._retrieve(link, filename):
                downloaded += 1
                if self.on_downloaded is not None:
                    self.on_downloaded(filename)
            else:
                failed += 1

        progress.complete()
        return (skipped, failed, downloaded, total)

# Executing this module directly only prints a pointer to 4grab.py.
if __name__ == "__main__":
    print("Don't run me, run 4grab.py")
added crunchbang 2010-01-17 03:56:00 +01:00			`#!/usr/bin/env python`
Added license info and README 2010-02-09 02:45:56 +01:00
			`######################################################################`
			`# Copyright 2009, 2010 ryuslash`
			`#`
			`# This file is part of 4grab.`
			`#`
			`# 4grab is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# 4grab is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with 4grab. If not, see <http://www.gnu.org/licenses/>.`
			`######################################################################`

Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`import urllib`
			`import os`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`import htmlparser`
Started working on configuration file 2010-01-18 20:08:38 +01:00			`import config`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`def get_savedir():`
			`conf = config.Configuration()`
			`savedir = conf.get_download_location()`
			`if not os.path.exists(savedir):`
			`os.makedirs(savedir)`
			`return savedir`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`def check_archive(fullpath):`
			`conf = config.Configuration()`
			`archive = conf.get_archive_location()`
			`filename = os.path.basename(fullpath)`
			`archfile = os.path.join(archive, filename)`
			`#print "Path", archfile, "exists:", os.path.exists(archfile)`
			`return os.path.exists(archfile)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`class Downloader(object):`
			`def __init__(self, progress_reporter):`
			`self.progress_reporter = progress_reporter`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`self.on_downloaded = None`

			`def set_on_downloaded(self, on_downloaded):`
			`self.on_downloaded = on_downloaded`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`def get_thread_links(self, baseurl):`
			`myparser = htmlparser.MyParser()`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`t = ["0", "1", "2", "3", "4",`
			`"5", "6", "7", "8", "9",`
			`"10", "11", "12", "13", "14", "15"]`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`i = 1`
			`total = len(t)`
			`progress = self.progress_reporter(total)`

			`for pagenum in t:`
			`progress.show_progress(i)`

			`url = baseurl + pagenum`
			`tries = 10`
			`while tries > 0:`
			`try:`
			`f = urllib.urlopen(url)`
			`break`
			`except IOError:`
			`tries -= 1`
			`print "\rTry of", url, "failed,", tries, "tries left"`
			`if not f is None:`
			`# Read the response`
			`s = f.read()`
			`f.close()`

			`# Process the page.`
			`myparser.parse(s)`
			`else:`
			`"\rOpening of", url, "did not succeed, trying next one..."`
			`i += 1`
Added --thread With --thread a signle thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`progress.complete()`
			`return myparser.get_hyperlinks()`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`def get_image_links(self, baseurl, t = []):`
			`mysubparser = htmlparser.MySubParser()`
			`total = len(t)`
			`progress = self.progress_reporter(total)`
			`i = 1`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`for link in t:`
			`progress.show_progress(i)`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`img_url = baseurl + link`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`tries = 10`
			`while tries > 0:`
			`try:`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`f = urllib.urlopen(img_url)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`break`
			`except IOError:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`tries -= 1`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`print "\rTry of", img_url, "failed,", tries, "tries left"`
			`if not f is None:`
			`s = f.read()`
			`f.close()`

			`mysubparser.parse(s)`
			`else:`
			`print "\rOpening of", img_url, "did not succeed, trying next one..."`
			`i += 1`

			`progress.complete()`
			`return mysubparser.get_hyperlinks()`

			`def get_images(self, t = []):`
			`skipped = 0`
			`failed = 0`
			`downloaded = 0`
			`total = len(t)`
			`progress = self.progress_reporter(total)`
			`i = 1`
			`for link in t:`
			`progress.show_progress(i)`
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`filename = os.path.join(get_savedir(), os.path.split(link)[1])`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`if not check_archive(filename):`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`tries = 10`
			`while tries > 0:`
			`try:`
			`urllib.urlretrieve(link, filename)`
			`break`
			`except IOError:`
			`tries -= 1`
			`if tries == 0:`
			`failed += 1`
			`else:`
			`downloaded += 1`
Sorting, multi category, multi resolution After a file has been downloaded a callback function can now be called. The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly. If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later. 4grab got a new command-line option, -s --sorter, to sort out old images, running python sorter.py has the same effect, but this seemed pretties. theoretically multiple categories could now be entered into the configuration file seperated by ',', but this hasn't been tested yet. mutliple resolutions could be entered into the configuration file, seperated by ',' like so: 1680x1050,1920x1200. Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it. 2010-03-19 00:18:04 +01:00			`if self.on_downloaded is not None:`
			`self.on_downloaded(filename)`
Stopped saying Failed and Skipped things Now it summarizes at the end 2010-02-12 00:04:34 +01:00			`else:`
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`skipped += 1`
			`i += 1`
Added --thread With --thread a signle thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
Removed download.py dependency download.py no longer requires progressbar.py, it now contains a class that accepts another class as a parameter, this may be useful later when adding different ways of interaction 2010-03-07 00:20:37 +01:00			`progress.complete()`
			`return (skipped, failed, downloaded, total)`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
			`if __name__ == "__main__":`
optioncreator, extra options config now has an optioncreator property that will allow another module to select how to prompt for a property resolutions and archive options have been added to config file for sorting functionality 2010-03-17 23:11:18 +01:00			`print "Don't run me, run 4grab.py"`