2010-01-17 03:56:00 +01:00
|
|
|
#!/usr/bin/env python
|
2010-02-09 02:45:56 +01:00
|
|
|
|
|
|
|
######################################################################
|
|
|
|
# Copyright 2009, 2010 ryuslash
|
|
|
|
#
|
|
|
|
# This file is part of 4grab.
|
|
|
|
#
|
|
|
|
# 4grab is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# 4grab is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
######################################################################
|
|
|
|
|
2010-01-15 08:22:17 +01:00
|
|
|
import urllib
|
|
|
|
import os
|
2010-01-17 03:48:23 +01:00
|
|
|
import htmlparser
|
2010-01-18 20:08:38 +01:00
|
|
|
import config
|
2010-03-25 22:28:08 +01:00
|
|
|
import sys
|
2010-01-15 08:22:17 +01:00
|
|
|
|
2010-03-17 23:11:18 +01:00
|
|
|
def get_savedir():
    """Return the configured download location, creating it when missing."""
    download_dir = config.Configuration().get_download_location()
    if os.path.exists(download_dir):
        return download_dir
    # Nothing at that path yet: create it, including any missing parents.
    os.makedirs(download_dir)
    return download_dir
|
Sorting, multi category, multi resolution
After a file has been downloaded a callback function can now be called.
The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly.
If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later.
4grab got a new command-line option, -s --sorter, to sort out old images; running python sorter.py has the same effect, but this seemed prettier.
Theoretically, multiple categories could now be entered into the configuration file separated by ',', but this hasn't been tested yet.
Multiple resolutions could be entered into the configuration file, separated by ',' like so: 1680x1050,1920x1200.
Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it.
2010-03-19 00:18:04 +01:00
|
|
|
def check_archive(fullpath):
    """Tell whether the archive location holds a file named like fullpath.

    Only the base name of fullpath is used; it is looked up directly
    inside the configured archive location.
    """
    archive_dir = config.Configuration().get_archive_location()
    candidate = os.path.join(archive_dir, os.path.basename(fullpath))
    return os.path.exists(candidate)
|
2010-03-25 22:28:08 +01:00
|
|
|
def write(message):
    """Write message to standard output and flush it straight away."""
    stream = sys.stdout
    stream.write(message)
    # Flush so messages without a trailing newline show up immediately.
    stream.flush()
|
2010-01-15 08:22:17 +01:00
|
|
|
|
2010-03-07 00:20:37 +01:00
|
|
|
class Downloader(object):
    """Collect thread and image links from pages and download the images.

    progress_reporter is called with the total number of items to process
    and must return an object offering show_progress(index) and complete().
    """

    def __init__(self, progress_reporter):
        self.progress_reporter = progress_reporter
        # Optional callback, called with the filename of every image that
        # was downloaded; see set_on_downloaded().
        self.on_downloaded = None

    def set_on_downloaded(self, on_downloaded):
        """Register the callback that runs after each downloaded image.

        The callback receives the local filename.  When it returns a false
        value, get_images() counts that image as failed.
        """
        self.on_downloaded = on_downloaded

    def _open_url(self, url, tries=10):
        """Open url, retrying on IOError, and return the handle or None.

        None is returned once `tries` attempts have all raised IOError, so
        callers can reliably tell a failed open from a successful one.
        """
        while tries > 0:
            try:
                return urllib.urlopen(url)
            except IOError:
                tries -= 1
                write("\rTry of %s failed, %d tries left" % (url, tries))
        return None

    def _retrieve(self, link, filename, tries=10):
        """Download link to filename, retrying on IOError.

        Returns True on success and False when every attempt failed.
        """
        while tries > 0:
            try:
                urllib.urlretrieve(link, filename)
                return True
            except IOError:
                tries -= 1
        return False

    def get_thread_links(self, baseurl):
        """Parse the pages baseurl + "0", baseurl + "1", ... for links.

        Pages are fetched until one answers with HTTP status 404.  Every
        other page that could be opened is fed to htmlparser.MyParser, and
        the hyperlinks the parser gathered are returned.

        NOTE(review): a URL that cannot be opened at all is skipped, so the
        loop only ends once some page returns 404 -- confirm this is
        acceptable when the host is unreachable.
        """
        myparser = htmlparser.MyParser()
        i = 0
        code = 0

        while code != 404:
            url = baseurl + str(i)
            f = self._open_url(url)

            if f is not None:
                # Close the handle even if reading the response fails.
                try:
                    code = f.getcode()
                    page = None if code == 404 else f.read()
                finally:
                    f.close()

                if code == 404:
                    write("\rCollected %d pages\n" % i)
                    break

                # Process the page.
                myparser.parse(page)
            else:
                write("\rOpening of %s did not succeed, trying next one..."
                      % url)

            i += 1
            write("\rCollected %d pages" % i)

        return myparser.get_hyperlinks()

    def get_image_links(self, baseurl, t=None):
        """Parse every page baseurl + link (for link in t) for image links.

        Each page that could be opened is fed to htmlparser.MySubParser;
        pages that could not be opened are reported and skipped.  Progress
        is reported per link.  Returns the hyperlinks the parser gathered.
        """
        links = [] if t is None else t
        mysubparser = htmlparser.MySubParser()
        progress = self.progress_reporter(len(links))

        for i, link in enumerate(links, 1):
            progress.show_progress(i)

            img_url = baseurl + link
            f = self._open_url(img_url)

            if f is not None:
                # Close the handle even if reading the response fails.
                try:
                    page = f.read()
                finally:
                    f.close()

                mysubparser.parse(page)
            else:
                write("\rOpening of %s did not succeed, "
                      "trying next one..." % img_url)

        progress.complete()
        return mysubparser.get_hyperlinks()

    def get_images(self, t=None):
        """Download every link in t into the save directory.

        Links whose file name is already present in the archive are
        skipped.  After a successful download the on_downloaded callback,
        if set, is called with the local filename; a false return value
        adds the image to the failed count as well (it stays counted as
        downloaded).

        Returns the tuple (skipped, failed, downloaded, total).
        """
        links = [] if t is None else t
        skipped = 0
        failed = 0
        downloaded = 0
        total = len(links)
        progress = self.progress_reporter(total)

        for i, link in enumerate(links, 1):
            progress.show_progress(i)

            filename = os.path.join(get_savedir(), os.path.split(link)[1])

            if check_archive(filename):
                skipped += 1
                continue

            if not self._retrieve(link, filename):
                failed += 1
                continue

            downloaded += 1
            if self.on_downloaded is not None:
                if not self.on_downloaded(filename):
                    failed += 1

        progress.complete()
        return (skipped, failed, downloaded, total)
|
2010-01-17 03:48:23 +01:00
|
|
|
|
|
|
|
# This module is a library for 4grab.py; running it directly only prints a hint.
if __name__ == "__main__":
    print("Don't run me, run 4grab.py")
|