4grab/download.py
ryuslash cc571e6d80 Still wasn't working on windows
If savedir did not exist, downloading would never work. Now, if it does not exist, it is created; if creating it isn't allowed, the script crashes and burns.
2010-02-08 01:48:12 +01:00

99 lines
2.7 KiB
Python

#!/usr/bin/env python
import urllib
import os
import htmlparser
import progressbar
import config
# Directory the downloaded files are written to, as reported by the
# configuration object.
savedir = config.Configuration().get_download_location()
# Create the directory up front.  Attempting the creation and inspecting the
# failure avoids the window between "check" and "create", and it also refuses
# a path that exists but is not a directory instead of silently accepting it.
try:
    os.makedirs(savedir)
except OSError:
    # makedirs raises when the path already exists; that is only acceptable
    # when what exists is a directory.  Anything else (permissions, a regular
    # file in the way, ...) is propagated to the caller.
    if not os.path.isdir(savedir):
        raise
def get_thread_links(baseurl):
myparser = htmlparser.MyParser()
t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
i = 1
total = len(t)
progress = progressbar.Progress(total)
for pagenum in t:
progress.show_progress(i)
url = base_url + pagenum
tries = 10
while tries > 0:
try:
f = urllib.urlopen(url)
break
except IOError:
tries -= 1
print "\rTry of", url, "failed,", tries, "tries left"
if not f is None:
# Read the response
s = f.read()
f.close()
# Process the page.
myparser.parse(s)
else:
"\rOpening of", url, "did not succeed, trying next one..."
i += 1
return myparser.get_hyperlinks()
def get_image_links(baseurl, t = []):
mysubparser = htmlparser.MySubParser()
total = len(t)
progress = progressbar.Progress(total)
i = 1
for link in t:
progress.show_progress(i)
img_url = base_url + link
tries = 10
while tries > 0:
try:
f = urllib.urlopen(img_url)
break
except IOError:
tries -= 1
print "\rTry of", img_url, "failed,", tries, "tries left"
if not f is None:
s = f.read()
f.close()
mysubparser.parse(s)
else:
print "\rOpening of", img_url, "did not succeed, trying next one..."
i += 1
return mysubparser.get_hyperlinks()
def get_images(t = []):
total = len(t)
progress = progressbar.Progress(total)
i = 1
for link in t:
progress.show_progress(i)
filename = os.path.join(savedir, os.path.split(link)[1])
if not os.path.exists(filename):
tries = 10
while tries > 0:
try:
urllib.urlretrieve(link, filename)
break
except IOError:
tries -= 1
print "\rDownloading of", link, "failed,", tries, "left"
else:
print "\rNot downloading", link, "already downloaded"
i += 1
if __name__ == "__main__":
    # Build the board URL for the configured category.  `base_url` is kept
    # as a module-level name (instead of moving this into a function) so
    # that anything reading it as a global keeps working.
    category = config.Configuration().get_category()
    base_url = "http://boards.4chan.org/" + category + "/"
    # Stage 1: hyperlinks gathered from the board's index pages.
    thread_links = get_thread_links(base_url)
    # Stage 2: hyperlinks gathered from each of those pages.
    image_links = get_image_links(base_url, thread_links)
    # Stage 3: download everything that was found.
    get_images(image_links)