4grab/download.py
ryuslash f09ea95c84 Added --thread
With --thread a single thread ID or thread URL can be entered.
If thread is a URL, it will download it.
If thread is an ID, a category must also be set.
2010-02-11 22:05:37 +01:00

124 lines
3.6 KiB
Python

#!/usr/bin/env python
######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
import urllib
import os
import htmlparser
import progressbar
import config
# Directory that downloaded images are written to. It is read from the
# configuration once, when this module is loaded, and created right away
# if it is missing so that later downloads can write into it.
_download_config = config.Configuration()
savedir = _download_config.get_download_location()
if not os.path.exists(savedir):
    os.makedirs(savedir)
def get_thread_links(baseurl):
    """Collect hyperlinks from the numbered index pages under *baseurl*.

    The pages ``baseurl + "0"`` through ``baseurl + "10"`` are opened in
    order and every response body is fed to a single
    ``htmlparser.MyParser``. Each page is attempted up to 10 times; a page
    that still cannot be opened is reported and skipped.

    baseurl -- URL prefix to which the page number is appended.

    Returns the result of ``MyParser.get_hyperlinks()`` after all pages
    have been processed.
    """
    myparser = htmlparser.MyParser()
    pages = [str(number) for number in range(11)]
    progress = progressbar.Progress(len(pages))

    for i, pagenum in enumerate(pages, 1):
        progress.show_progress(i)
        url = baseurl + pagenum

        # Start every page without a handle. Otherwise a page whose tries
        # all fail would either hit an unbound name (first page) or reuse
        # the previous page's already-closed handle.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                # Single-argument print() parses on Python 2 and 3 alike;
                # the emitted text is the same as before.
                print("\rTry of %s failed, %s tries left" % (url, tries))

        if f is None:
            print("\rOpening of %s did not succeed, trying next one..." % url)
            continue

        # Read the response, closing the handle even if the read fails.
        try:
            s = f.read()
        finally:
            f.close()

        # Process the page.
        myparser.parse(s)

    progress.complete()
    return myparser.get_hyperlinks()
def get_image_links(baseurl, t=[]):
    """Collect hyperlinks from every page named in *t*.

    Each entry of *t* is appended to *baseurl*, the resulting URL is
    opened (up to 10 attempts) and the response body is fed to a single
    ``htmlparser.MySubParser``. A page that cannot be opened is reported
    and skipped.

    baseurl -- URL prefix that each entry of *t* is appended to.
    t       -- sequence of link strings; the default list is never
               mutated here, so sharing it between calls is harmless.

    Returns the result of ``MySubParser.get_hyperlinks()`` after all
    pages have been processed.
    """
    mysubparser = htmlparser.MySubParser()
    progress = progressbar.Progress(len(t))

    for i, link in enumerate(t, 1):
        progress.show_progress(i)
        img_url = baseurl + link

        # Start every page without a handle. Otherwise a page whose tries
        # all fail would either hit an unbound name (first page) or reuse
        # the previous page's already-closed handle.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries -= 1
                # Single-argument print() parses on Python 2 and 3 alike;
                # the emitted text is the same as before.
                print("\rTry of %s failed, %s tries left" % (img_url, tries))

        if f is None:
            print("\rOpening of %s did not succeed, trying next one..."
                  % img_url)
            continue

        # Close the handle even if reading the body fails.
        try:
            s = f.read()
        finally:
            f.close()

        mysubparser.parse(s)

    progress.complete()
    return mysubparser.get_hyperlinks()
def get_images(t=[]):
    """Download every URL in *t* into the module-level ``savedir``.

    The local file name is the last path component of the URL. URLs whose
    target file already exists are skipped with a message. Each download
    is attempted up to 10 times; when all attempts fail, any partial file
    left on disk is removed so a later run will try that URL again.

    t -- sequence of image URLs; the default list is never mutated here.
    """
    progress = progressbar.Progress(len(t))

    for i, link in enumerate(t, 1):
        progress.show_progress(i)
        filename = os.path.join(savedir, os.path.split(link)[1])

        if os.path.exists(filename):
            # Single-argument print() parses on Python 2 and 3 alike; the
            # emitted text is the same as before.
            print("\rNot downloading %s already downloaded" % link)
            continue

        tries = 10
        while tries > 0:
            try:
                urllib.urlretrieve(link, filename)
                break
            except IOError:
                tries -= 1
                print("\rDownloading of %s failed, %s left" % (link, tries))
        else:
            # Reached only when the loop ran out of tries without a
            # successful download. A failed transfer can leave a truncated
            # file behind, which the existence check above would otherwise
            # mistake for a finished download on the next run.
            if os.path.exists(filename):
                try:
                    os.remove(filename)
                except OSError:
                    # Best-effort cleanup; keep going with the other links.
                    pass

    progress.complete()
if __name__ == "__main__":
    # Build the base URL of the configured board category.
    category = config.Configuration().get_category()
    base_url = "http://boards.4chan.org/" + category + "/"
    # First gather the thread links, then the image links found through
    # them, and finally download the images themselves.
    thread_links = get_thread_links(base_url)
    image_links = get_image_links(base_url, thread_links)
    get_images(image_links)