Can download images from /w/
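"""Scrape images from 4chan's /w/ board in three passes: fetch the
first ten board pages and collect thread links ('res/<id>'), fetch
each thread and collect full-size image links ('/src/<id>.<ext>'),
then download any image not already present in savedir."""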
import urllib
import sgmllib
import re
import os

# Directory the downloaded images are saved to; it must already exist.
savedir = "/home/slash/Pictures/4grab/"
class MyParser(sgmllib.SGMLParser):
    "A simple parser class."

    def parse(self, s):
        "Parse the given string 's'."
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []

        # Board pages link to threads as 'res/<digits>'; image links
        # end in '/<digits>.<extension>'.
        self.url_reg = re.compile(r'res/\d+\Z')
        self.img_reg = re.compile(r'/\d+\.(jpg|gif|bmp|png|jpeg)\Z')

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."
        for name, value in attributes:
            if name == "href":
                if self.url_reg.search(value) is not None:
                    self.hyperlinks.append(value)

    def get_hyperlinks(self):
        "Return the list of hyperlinks."
        return self.hyperlinks
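# Illustrative sketch (not part of the original script): feeding the
# parser a board-page fragment leaves the matching hrefs behind.
#
#   p = MyParser()
#   p.parse('<a href="res/12345">thread</a><a href="/faq">faq</a>')
#   p.get_hyperlinks()   # -> ['res/12345']; '/faq' fails url_reg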
class MySubParser(MyParser):
    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        # Override the pattern: inside a thread, full-size images are
        # linked as '/src/<digits>.<ext>'.
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')
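# Illustrative sketch: with the overridden pattern, only full-size
# image links survive (the host below is made up for the example).
#
#   sp = MySubParser()
#   sp.parse('<a href="http://img.example.org/w/src/1234567890.jpg">x</a>')
#   sp.get_hyperlinks()   # -> the absolute image URL, later fed to urlretrieve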
if __name__ == "__main__":
    # Walk the first ten pages of the 4chan.org /w/ imageboard and
    # collect their thread links.
    base_url = "http://boards.4chan.org/w/"
    myparser = MyParser()

    total = 10
    for i in range(0, total):
        # Page 0 is the board index itself; later pages are '/w/1', '/w/2', ...
        if i > 0:
            url = base_url + str(i)
        else:
            url = base_url

        # Retry up to ten times; f stays None if every attempt fails.
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries = tries - 1
                print "Try of", url, "failed,", tries, "tries left"

        if f is not None:
            # Read the page and feed it to the parser.
            s = f.read()
            f.close()
            myparser.parse(s)
            print "Parsed", url, "-", i + 1, "of", total
        else:
            print "Opening of", url, "did not succeed, trying next one..."
    # Visit each thread and collect its image links.
    t = myparser.get_hyperlinks()
    mysubparser = MySubParser()
    total = len(t)
    i = 1
    for link in t:
        img_url = base_url + link
        f = None
        tries = 10
        while tries > 0:
            try:
                f = urllib.urlopen(img_url)
                break
            except IOError:
                tries = tries - 1
                print "Try of", img_url, "failed,", tries, "tries left"

        if f is not None:
            s = f.read()
            f.close()
            mysubparser.parse(s)
            print "Parsed", img_url, "-", i, "of", total
        else:
            print "Opening of", img_url, "did not succeed, trying next one..."
        i = i + 1
    # Download every image that is not already present in savedir.
    t = mysubparser.get_hyperlinks()
    total = len(t)
    i = 1
    for link in t:
        filename = os.path.join(savedir, os.path.split(link)[1])
        if not os.path.exists(filename):
            tries = 10
            while tries > 0:
                try:
                    urllib.urlretrieve(link, filename)
                    print "Retrieved", link, "-", i, "of", total
                    break
                except IOError:
                    tries = tries - 1
                    print "Downloading of", link, "failed,", tries, "left"
        else:
            print "Not downloading", link, "already downloaded"
        i = i + 1
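# Usage note (an addition, not in the original paste): this is Python 2
# code; sgmllib was removed in Python 3, and urllib.urlopen/urlretrieve
# moved to urllib.request. Run it with a Python 2 interpreter, and make
# sure savedir exists first, since urlretrieve will not create it.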