4grab/download.py
ryuslash 14e2b0cc54 non-fixed page count
4grab no longer assumes either 11 or 16 pages; it will keep trying to collect new pages up to the moment it receives a 404 error.
2010-03-25 22:28:08 +01:00

153 lines
4.7 KiB
Python

#!/usr/bin/env python
######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
import urllib
import os
import htmlparser
import config
import sys
def get_savedir():
    """Return the configured download location, creating it if missing.

    The location is read from a fresh config.Configuration() on every
    call; when the path does not exist yet it is created with
    os.makedirs() before being returned.
    """
    target = config.Configuration().get_download_location()
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target
def check_archive(fullpath):
    """Tell whether the archive location holds a file named like fullpath.

    Only the final path component of fullpath is used; it is looked up
    directly inside the archive location reported by the configuration.
    Returns True when such a path exists, False otherwise.
    """
    archive_dir = config.Configuration().get_archive_location()
    _, name = os.path.split(fullpath)
    return os.path.exists(os.path.join(archive_dir, name))
def write(message):
    """Write message to standard output and flush it right away.

    No newline is appended, so callers can redraw a status line by
    starting the message with a carriage return.
    """
    stream = sys.stdout
    stream.write(message)
    stream.flush()
class Downloader(object):
    """Collect thread links and image links page by page, then fetch images.

    progress_reporter is a callable that receives the total number of
    items and returns an object offering show_progress(i) and complete().
    """

    # Attempts made for a single URL before it is considered unreachable.
    MAX_TRIES = 10
    # Consecutive unreachable pages after which page collection stops.
    # Without a response the terminating 404 can never be observed, so an
    # unreachable host would otherwise keep the loop running forever.
    MAX_FAILED_PAGES = 3

    def __init__(self, progress_reporter):
        """Remember the progress reporter factory; no callback is set yet."""
        self.progress_reporter = progress_reporter
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register a callback invoked with the filename of each download.

        A false return value from the callback makes get_images() count
        that file as failed.
        """
        self.on_downloaded = on_downloaded

    def _open_with_retries(self, url):
        """Open url, retrying on IOError; return the response or None."""
        for tries_left in range(self.MAX_TRIES - 1, -1, -1):
            try:
                return urllib.urlopen(url)
            except IOError:
                write("\rTry of %s failed, %d tries left" % (url, tries_left))
        return None

    def _retrieve_with_retries(self, link, filename):
        """Download link to filename, retrying on IOError; True on success."""
        for _ in range(self.MAX_TRIES):
            try:
                urllib.urlretrieve(link, filename)
            except IOError:
                continue
            return True
        return False

    def get_thread_links(self, baseurl):
        """Parse pages baseurl0, baseurl1, ... until one answers with 404.

        Every page that can be read is fed to an htmlparser.MyParser; the
        parser's get_hyperlinks() result is returned.  A page that cannot
        be opened is skipped, and collection is abandoned once
        MAX_FAILED_PAGES pages in a row could not be opened.
        """
        myparser = htmlparser.MyParser()
        page = 0
        failed_pages = 0

        while True:
            url = baseurl + str(page)
            response = self._open_with_retries(url)

            if response is not None:
                failed_pages = 0
                # Close the response even if reading it fails.
                try:
                    code = response.getcode()
                    body = None if code == 404 else response.read()
                finally:
                    response.close()

                if code == 404:
                    # The first missing page marks the end of the board.
                    write("\rCollected %d pages\n" % page)
                    break

                myparser.parse(body)
            else:
                write("\rOpening of %s did not succeed, trying next one..."
                      % url)
                failed_pages += 1
                if failed_pages >= self.MAX_FAILED_PAGES:
                    write("\rGiving up after %d consecutive unreachable "
                          "pages\n" % failed_pages)
                    break

            page += 1
            write("\rCollected %d pages" % page)

        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Parse every page baseurl + link for link in t.

        Each page that can be read is fed to an htmlparser.MySubParser;
        pages that cannot be opened are reported and skipped.  Returns the
        parser's get_hyperlinks() result.
        """
        links = [] if t is None else t
        mysubparser = htmlparser.MySubParser()
        progress = self.progress_reporter(len(links))

        for i, link in enumerate(links, 1):
            progress.show_progress(i)
            img_url = baseurl + link
            response = self._open_with_retries(img_url)

            if response is None:
                write("\rOpening of %s did not succeed, "
                      "trying next one..." % img_url)
                continue

            # Close the response even if reading it fails.
            try:
                body = response.read()
            finally:
                response.close()
            mysubparser.parse(body)

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every link in t into the save directory.

        Links whose file name is already present in the archive are
        skipped.  Returns the tuple (skipped, failed, downloaded, total).
        A file that was retrieved but rejected by the on_downloaded
        callback is counted both as downloaded and as failed.
        """
        links = [] if t is None else t
        skipped = failed = downloaded = 0
        total = len(links)
        progress = self.progress_reporter(total)

        for i, link in enumerate(links, 1):
            progress.show_progress(i)
            filename = os.path.join(get_savedir(), os.path.split(link)[1])

            if check_archive(filename):
                skipped += 1
                continue

            if not self._retrieve_with_retries(link, filename):
                failed += 1
                continue

            downloaded += 1
            if self.on_downloaded is not None:
                if not self.on_downloaded(filename):
                    failed += 1

        progress.complete()
        return (skipped, failed, downloaded, total)
# This module only supplies helpers; running it directly just points the
# user at the real entry script.
if __name__ == "__main__":
    print("Don't run me, run 4grab.py")