2010-01-17 03:56:00 +01:00
|
|
|
#!/usr/bin/env python
|
2010-02-09 02:45:56 +01:00
|
|
|
|
|
|
|
######################################################################
|
|
|
|
# Copyright 2009, 2010 ryuslash
|
|
|
|
#
|
|
|
|
# This file is part of 4grab.
|
|
|
|
#
|
|
|
|
# 4grab is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# 4grab is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
######################################################################
|
|
|
|
|
2010-01-15 08:22:17 +01:00
|
|
|
import urllib
|
|
|
|
import os
|
2010-01-17 03:48:23 +01:00
|
|
|
import htmlparser
|
|
|
|
import progressbar
|
2010-01-18 20:08:38 +01:00
|
|
|
import config
|
2010-01-15 08:22:17 +01:00
|
|
|
|
2010-01-26 20:20:39 +01:00
|
|
|
# Directory that downloaded images are written to, as reported by the
# project configuration.  It is created at import time when it does not
# exist yet, so later code can write into it directly.
_download_config = config.Configuration()
savedir = _download_config.get_download_location()
if not os.path.exists(savedir):
    os.makedirs(savedir)
|
2010-01-15 08:22:17 +01:00
|
|
|
|
2010-01-17 03:48:23 +01:00
|
|
|
def get_thread_links(baseurl):
    """Fetch the board index pages and return the hyperlinks found in them.

    The pages ``baseurl + "0"`` through ``baseurl + "10"`` are opened one
    after another.  Each page gets up to ten attempts; a page that cannot
    be opened is reported and skipped.  Every page that was read is fed to
    an ``htmlparser.MyParser`` instance.

    Arguments:
    baseurl -- string that the page number is appended to.

    Returns whatever ``MyParser.get_hyperlinks()`` yields after all pages
    have been parsed.
    """
    myparser = htmlparser.MyParser()
    pages = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
    progress = progressbar.Progress(len(pages))

    for index, pagenum in enumerate(pages):
        # The progress bar is driven with a 1-based position.
        progress.show_progress(index + 1)

        url = baseurl + pagenum

        # Reset the handle for every page.  Without this, a page whose
        # retries are all exhausted would either hit an unbound name (first
        # page) or re-read the previous, already closed response.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                # Single-argument print: same output under the print
                # statement and the print function.
                print("\rTry of %s failed, %s tries left" % (url, tries))

        if f is None:
            print("\rOpening of %s did not succeed, trying next one..."
                  % (url,))
            continue

        # Read the response, closing the handle even if reading fails.
        try:
            s = f.read()
        finally:
            f.close()

        # Process the page.
        myparser.parse(s)

    return myparser.get_hyperlinks()
|
|
|
|
|
|
|
|
def get_image_links(baseurl, t=None):
    """Fetch every linked page and return the hyperlinks found in them.

    For each entry of ``t`` the URL ``baseurl + entry`` is opened, with up
    to ten attempts.  A page that cannot be opened is reported and
    skipped.  Every page that was read is fed to an
    ``htmlparser.MySubParser`` instance.

    Arguments:
    baseurl -- string that each entry of ``t`` is appended to.
    t       -- sequence of link strings; ``None`` (the default) is treated
               as an empty sequence.

    Returns whatever ``MySubParser.get_hyperlinks()`` yields after all
    pages have been parsed.
    """
    # A None default avoids sharing one list object between calls.
    links = [] if t is None else t

    mysubparser = htmlparser.MySubParser()
    progress = progressbar.Progress(len(links))

    for index, link in enumerate(links):
        # The progress bar is driven with a 1-based position.
        progress.show_progress(index + 1)

        img_url = baseurl + link

        # Reset the handle for every link.  Without this, a link whose
        # retries are all exhausted would either hit an unbound name (first
        # link) or re-read the previous, already closed response.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries -= 1
                # Single-argument print: same output under the print
                # statement and the print function.
                print("\rTry of %s failed, %s tries left" % (img_url, tries))

        if f is None:
            print("\rOpening of %s did not succeed, trying next one..."
                  % (img_url,))
            continue

        # Read the response, closing the handle even if reading fails.
        try:
            s = f.read()
        finally:
            f.close()

        mysubparser.parse(s)

    return mysubparser.get_hyperlinks()
|
|
|
|
|
|
|
|
def get_images(t=None):
    """Download every link in ``t`` into the module-level ``savedir``.

    The local file name is the last path component of the link.  A link
    whose target file already exists is reported and skipped.  Every other
    link gets up to ten download attempts.

    Arguments:
    t -- sequence of URL strings; ``None`` (the default) is treated as an
         empty sequence.

    Returns None.
    """
    # A None default avoids sharing one list object between calls.
    links = [] if t is None else t

    progress = progressbar.Progress(len(links))

    for index, link in enumerate(links):
        # The progress bar is driven with a 1-based position.
        progress.show_progress(index + 1)

        filename = os.path.join(savedir, os.path.split(link)[1])

        if os.path.exists(filename):
            # Single-argument print: same output under the print statement
            # and the print function.
            print("\rNot downloading %s already downloaded" % (link,))
            continue

        tries = 10
        while tries > 0:
            try:
                urllib.urlretrieve(link, filename)
                break
            except IOError:
                tries -= 1
                print("\rDownloading of %s failed, %s left" % (link, tries))
                # A failed transfer may leave a truncated file behind.
                # Remove it so that a later run does not mistake it for a
                # finished download and skip the link.
                if os.path.exists(filename):
                    os.remove(filename)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the board URL from the category named in the configuration.
    category = config.Configuration().get_category()
    base_url = "http://boards.4chan.org/" + category + "/"

    # Three passes: the links returned by get_thread_links() are handed to
    # get_image_links(), and its result is handed to get_images() for
    # downloading.
    thread_links = get_thread_links(base_url)
    image_links = get_image_links(base_url, thread_links)
    get_images(image_links)
|