4grab/download.py
ryuslash 4b70374e9d FAIL and print
If source and dest in copy are the same, it is no longer reported
If an image can't be read, it is counted as failed
2010-03-19 16:08:39 +01:00


#!/usr/bin/env python
######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
import urllib
import os
import htmlparser
import config

def get_savedir():
    conf = config.Configuration()
    savedir = conf.get_download_location()
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    return savedir

def check_archive(fullpath):
    conf = config.Configuration()
    archive = conf.get_archive_location()
    filename = os.path.basename(fullpath)
    archfile = os.path.join(archive, filename)
    return os.path.exists(archfile)

class Downloader(object):
    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        self.on_downloaded = on_downloaded

    def get_thread_links(self, baseurl):
        myparser = htmlparser.MyParser()
        t = ["0", "1", "2", "3", "4",
             "5", "6", "7", "8", "9",
             "10", "11", "12", "13", "14", "15"]
        i = 1
        total = len(t)
        progress = self.progress_reporter(total)
        for pagenum in t:
            progress.show_progress(i)
            url = baseurl + pagenum
            f = None
            tries = 10
            while tries > 0:
                try:
                    f = urllib.urlopen(url)
                    break
                except IOError:
                    tries -= 1
                    print "\rTry of", url, "failed,", tries, "tries left"
            if f is not None:
                # Read the response and feed the page to the parser.
                s = f.read()
                f.close()
                myparser.parse(s)
            else:
                print "\rOpening of", url, "did not succeed, trying next one..."
            i += 1
        progress.complete()
        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=[]):
        mysubparser = htmlparser.MySubParser()
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1
        for link in t:
            progress.show_progress(i)
            img_url = baseurl + link
            f = None
            tries = 10
            while tries > 0:
                try:
                    f = urllib.urlopen(img_url)
                    break
                except IOError:
                    tries -= 1
                    print "\rTry of", img_url, "failed,", tries, "tries left"
            if f is not None:
                s = f.read()
                f.close()
                mysubparser.parse(s)
            else:
                print "\rOpening of", img_url, "did not succeed, trying next one..."
            i += 1
        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=[]):
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1
        for link in t:
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if not check_archive(filename):
                tries = 10
                while tries > 0:
                    try:
                        urllib.urlretrieve(link, filename)
                        break
                    except IOError:
                        tries -= 1
                if tries == 0:
                    failed += 1
                else:
                    downloaded += 1
                    if self.on_downloaded is not None:
                        # The callback can reject the file (e.g. an image
                        # that cannot be read); count that as a failure.
                        if not self.on_downloaded(filename):
                            failed += 1
            else:
                # Already present in the archive, so skip it.
                skipped += 1
            i += 1
        progress.complete()
        return (skipped, failed, downloaded, total)
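
# Illustrative sketch, not part of the original module: Downloader is
# handed a progress_reporter factory that it calls with a total count,
# and the returned object must provide show_progress(i) and complete().
# The class below is a hypothetical minimal console implementation of
# that assumed interface.
class ExampleConsoleProgress(object):
    def __init__(self, total):
        self.total = total

    def show_progress(self, current):
        # \r keeps the counter on one line, matching the \r-prefixed
        # retry messages printed by Downloader.
        sys.stdout.write("\r%d/%d" % (current, self.total))
        sys.stdout.flush()

    def complete(self):
        print ""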

if __name__ == "__main__":
    print "Don't run me, run 4grab.py"