af529bcd4e
Using kirbybase to store downloaded images. A nasty bug that always returned the wrong result when checking whether an image had already been downloaded has been fixed.
154 lines
4.4 KiB
Python
154 lines
4.4 KiB
Python
######################################################################
|
|
# Copyright 2009, 2010 ryuslash
|
|
#
|
|
# This file is part of 4grab.
|
|
#
|
|
# 4grab is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# 4grab is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
|
|
######################################################################
|
|
|
|
import urllib
|
|
import os
|
|
import htmlparser
|
|
import config
|
|
import sys
|
|
import backend
|
|
|
|
def get_savedir():
    """Return the configured download directory, creating it if needed.

    Reads the download location from the user's configuration and makes
    sure the directory exists on disk before returning its path.
    """
    conf = config.Configuration()
    savedir = conf.get_download_location()
    # Create unconditionally and tolerate "already exists" instead of an
    # exists()-then-makedirs() check, which is racy: another process could
    # create (or remove) the directory between the two calls.
    try:
        os.makedirs(savedir)
    except OSError:
        # Only swallow the error when the directory is actually there;
        # re-raise real failures (permissions, a file in the way, ...).
        if not os.path.isdir(savedir):
            raise
    return savedir
|
|
def check_archive(fullpath):
    """Return whether the image named by *fullpath* is already archived.

    Only the basename of *fullpath* is looked up in the backend store;
    the directory part is ignored.
    """
    return backend.Backend().check(os.path.basename(fullpath))
|
|
|
|
def write(message):
    """Print *message* to stdout immediately, without a trailing newline.

    Flushing right away keeps \r-style progress lines visible even when
    stdout is block-buffered.
    """
    out = sys.stdout
    out.write(message)
    out.flush()
|
|
|
|
class Downloader(object):
    """Collects thread/image links from 4chan index pages and downloads images.

    *progress_reporter* is a callable taking a total item count and
    returning an object with ``show_progress(i)`` and ``complete()``
    methods (see how get_image_links/get_images drive it below).
    """

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        # Optional callback invoked with each downloaded filename; a falsy
        # return value makes that download count as failed.
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register a callback to run after every successful download."""
        self.on_downloaded = on_downloaded

    def download(self, url):
        """Open *url* with up to 10 retries on IOError.

        Returns the open file-like response object, or None when every
        attempt failed. The caller is responsible for closing it.
        """
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                write("\rTry of %s failed, %d tries left" % (url, tries))
        return f

    def get_thread_links(self, baseurl):
        """Walk index pages baseurl+0, baseurl+1, ... until a 404.

        Every successfully fetched page is fed to the HTML parser; the
        parser's accumulated hyperlinks are returned.
        """
        myparser = htmlparser.MyParser()
        i = 0
        code = 0
        url = None

        while code != 404:
            url = baseurl + str(i)
            f = self.download(url)

            if f is not None:
                code = f.getcode()
                if code == 404:
                    # Past the last page: report the final count and let
                    # the loop condition terminate.
                    write("\rCollected %d pages\n" % i)
                    f.close()
                    continue
                # Read the response
                s = f.read()
                f.close()

                # Process the page.
                myparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, trying next one..." \
                      % url)
            i += 1
            write("\rCollected %d pages" % i)

        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Fetch each thread page in *t* and parse it for image links.

        *t* is a list of thread-relative links appended to *baseurl*.
        Returns the sub-parser's accumulated image hyperlinks.
        """
        # None sentinel instead of a mutable default argument ([] shared
        # across calls); behavior is unchanged since t is never mutated.
        if t is None:
            t = []
        mysubparser = htmlparser.MySubParser()
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1

        for link in t:
            progress.show_progress(i)

            img_url = baseurl + link
            f = self.download(img_url)

            if f is not None:
                s = f.read()
                f.close()

                mysubparser.parse(s)
            else:
                write("\rOpening of %s did not succeed, " \
                      "trying next one..." % img_url)
            i += 1

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every image URL in *t* that is not already archived.

        Each image is saved under get_savedir() using its basename; URLs
        whose file is already in the archive are skipped. Returns the
        tuple (skipped, failed, downloaded, total).
        """
        # None sentinel instead of a mutable default argument; see
        # get_image_links.
        if t is None:
            t = []
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(t)
        progress = self.progress_reporter(total)
        i = 1
        for link in t:
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])
            if not check_archive(filename):
                tries = 10
                while tries > 0:
                    try:
                        urllib.urlretrieve(link, filename)
                        break
                    except IOError:
                        tries -= 1
                if tries == 0:
                    failed += 1
                else:
                    downloaded += 1
                    if self.on_downloaded is not None:
                        # A falsy callback result demotes this download
                        # to a failure (it still counts as downloaded).
                        if not self.on_downloaded(filename):
                            failed += 1
            else:
                skipped += 1
            i += 1

        progress.complete()

        # Persist the archive index so skipped-checks work on the next run.
        be = backend.Backend()
        be.save()

        return (skipped, failed, downloaded, total)
|
|
|
|
# Guard: this module is a library used by 4grab.py, not an entry point.
if __name__ == "__main__":
    print "Don't run me, run 4grab.py"
|