4grab/download.py

#!/usr/bin/env python

######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

import urllib
import os
import htmlparser
import progressbar
import config

# Directory that downloaded images are written to, as reported by the
# configuration; get_images() joins every file name onto this path.
savedir = config.Configuration().get_download_location()
# Create the directory (including missing parents) as soon as the module
# is loaded, so later downloads have somewhere to go.  os.makedirs is
# allowed to raise here if the location cannot be created.
if not os.path.exists(savedir):
    os.makedirs(savedir)

def get_thread_links(baseurl):
    myparser = htmlparser.MyParser()
    t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
    i = 1
    total = len(t)
    progress = progressbar.Progress(total)

    for pagenum in t:
        progress.show_progress(i)

        url = baseurl + pagenum
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                print "\rTry of", url, "failed,", tries, "tries left"
        if not f is None:
            # Read the response
            s = f.read()
            f.close()

            # Process the page.
            myparser.parse(s)
        else:
            "\rOpening of", url, "did not succeed, trying next one..."
        i += 1
    
    progress.complete()
    return myparser.get_hyperlinks()

def get_image_links(baseurl, t = []):
    mysubparser = htmlparser.MySubParser()
    total = len(t)
    progress = progressbar.Progress(total)
    i = 1
    
    for link in t:
        progress.show_progress(i)

        img_url = baseurl + link
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries -= 1
                print "\rTry of", img_url, "failed,", tries, "tries left"
        if not f is None:
            s = f.read()
            f.close()

            mysubparser.parse(s)
        else:
            print "\rOpening of", img_url, "did not succeed, trying next one..."
        i += 1

    progress.complete()
    return mysubparser.get_hyperlinks()

def get_images(t = []):
    total = len(t)
    progress = progressbar.Progress(total)
    i = 1
    for link in t:
        progress.show_progress(i)
        filename = os.path.join(savedir, os.path.split(link)[1])
        if not os.path.exists(filename):
            tries = 10
            while tries > 0:
                try:
                    urllib.urlretrieve(link, filename)
                    break
                except IOError:
                    tries -= 1
                    print "\rDownloading of", link, "failed,", tries, "left"
        else:
            print "\rNot downloading", link, "already downloaded"
        i += 1
    
    progress.complete()

if __name__ == "__main__":
    # Build the board URL from the category stored in the configuration.
    category = config.Configuration().get_category()
    base_url = "http://boards.4chan.org/" + category + "/"

    # Board index pages -> thread links -> image links -> downloads.
    thread_links = get_thread_links(base_url)
    image_links = get_image_links(base_url, thread_links)
    get_images(image_links)
added crunchbang 2010-01-17 03:56:00 +01:00			`#!/usr/bin/env python`
Added license info and README 2010-02-09 02:45:56 +01:00
			`######################################################################`
			`# Copyright 2009, 2010 ryuslash`
			`#`
			`# This file is part of 4grab.`
			`#`
			`# 4grab is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# 4grab is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with 4grab. If not, see <http://www.gnu.org/licenses/>.`
			`######################################################################`

Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`import urllib`
			`import os`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`import htmlparser`
			`import progressbar`
Started working on configuration file 2010-01-18 20:08:38 +01:00			`import config`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
Configuration * A configuration file is made if one hasn't been found * Certain values (currently download category and location) are stored in config file. 2010-01-26 20:20:39 +01:00			`savedir = config.Configuration().get_download_location()`
Still wasn't working on windows if savedir did not exists, then downloading would never work. Now if it does not exist, it is created, or it crashes and burns if it isn't allowed. 2010-02-08 01:48:12 +01:00			`if not os.path.exists(savedir):`
			`os.makedirs(savedir)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`def get_thread_links(baseurl):`
			`myparser = htmlparser.MyParser()`
			`t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]`
			`i = 1`
			`total = len(t)`
			`progress = progressbar.Progress(total)`

			`for pagenum in t:`
			`progress.show_progress(i)`

configuration settings and startfile Configuration Settings can now be changed with the -e command line argument execution of 4grab has been moved to 4grab.py, but should also still work with download.py 2010-02-09 02:10:17 +01:00			`url = baseurl + pagenum`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`tries = 10`
			`while tries > 0:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`try:`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`f = urllib.urlopen(url)`
			`break`
			`except IOError:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`tries -= 1`
			`print "\rTry of", url, "failed,", tries, "tries left"`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`if not f is None:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`# Read the response`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`s = f.read()`
			`f.close()`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
			`# Process the page.`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`myparser.parse(s)`
			`else:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`"\rOpening of", url, "did not succeed, trying next one..."`
			`i += 1`
Added --thread With --thread a signle thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
			`progress.complete()`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`return myparser.get_hyperlinks()`

			`def get_image_links(baseurl, t = []):`
			`mysubparser = htmlparser.MySubParser()`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`total = len(t)`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`progress = progressbar.Progress(total)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`i = 1`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`for link in t:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`progress.show_progress(i)`

configuration settings and startfile Configuration Settings can now be changed with the -e command line argument execution of 4grab has been moved to 4grab.py, but should also still work with download.py 2010-02-09 02:10:17 +01:00			`img_url = baseurl + link`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`tries = 10`
			`while tries > 0:`
			`try:`
			`f = urllib.urlopen(img_url)`
			`break`
			`except IOError:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`tries -= 1`
			`print "\rTry of", img_url, "failed,", tries, "tries left"`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`if not f is None:`
			`s = f.read()`
			`f.close()`

			`mysubparser.parse(s)`
			`else:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`print "\rOpening of", img_url, "did not succeed, trying next one..."`
			`i += 1`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00
Added --thread With --thread a signle thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00			`progress.complete()`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`return mysubparser.get_hyperlinks()`

			`def get_images(t = []):`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`total = len(t)`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`progress = progressbar.Progress(total)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`i = 1`
			`for link in t:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`progress.show_progress(i)`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`filename = os.path.join(savedir, os.path.split(link)[1])`
			`if not os.path.exists(filename):`
			`tries = 10`
			`while tries > 0:`
			`try:`
			`urllib.urlretrieve(link, filename)`
			`break`
			`except IOError:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`tries -= 1`
			`print "\rDownloading of", link, "failed,", tries, "left"`
Initial commit Can download images from /w/ 2010-01-15 08:22:17 +01:00			`else:`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00			`print "\rNot downloading", link, "already downloaded"`
			`i += 1`
Added --thread With --thread a signle thread ID or thread URL can be entered. If thread is a URL, it will download it. If thread is an ID, a category must also be set. 2010-02-11 22:05:37 +01:00
			`progress.complete()`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
			`if __name__ == "__main__":`
			`# Get a file-like object for the 4chan.org w/imgboard`
Configuration * A configuration file is made if one hasn't been found * Certain values (currently download category and location) are stored in config file. 2010-01-26 20:20:39 +01:00			`base_url = "http://boards.4chan.org/" + config.Configuration().get_category() + "/"`
Parser seperation, progress bar * Seperated the parser from the downloader code. * Added a progressbar class, to make it look fancier * Created some functions to do all the work in downloader.py, cleaner now * Changed parser.py to htmlparser.py, since it was conflicting with a built-in module 2010-01-17 03:48:23 +01:00
			`# Get the hyperlinks.`
			`t = get_thread_links(base_url)`
			`t = get_image_links(base_url, t)`
			`get_images(t)`