Parser separation, progress bar
* Separated the parser from the downloader code.
* Added a progress bar class to make the output look fancier.
* Created functions to do all the work in download.py; it is cleaner now.
* Renamed parser.py to htmlparser.py, since it was conflicting with a built-in module.
This commit is contained in:
parent fb65246575
commit 025a723a5a

4 changed files with 109 additions and 74 deletions
1  .gitignore (vendored)

@@ -1 +1,2 @@
 *~
+*.pyc
113  download.py

@@ -1,82 +1,50 @@
 import urllib
-import sgmllib
-import re
 import os
+import htmlparser
+import progressbar
 
 savedir = "/home/slash/Pictures/4grab/"
 
-class MyParser(sgmllib.SGMLParser):
-    "A simple parser class."
-
-    def parse(self, s):
-        "Parse the given string 's'."
-        self.feed(s)
-        self.close()
-
-    def __init__(self, verbose=0):
-        "Initialise an object, passing 'verbose' to the superclass."
-
-        sgmllib.SGMLParser.__init__(self, verbose)
-        self.hyperlinks = []
-
-        self.url_reg = re.compile('res/\d+\Z')
-        self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z')
-
-    def start_a(self, attributes):
-        "Process a hyperlink and its 'attributes'."
-
-        for name, value in attributes:
-            if name == "href":
-                if self.url_reg.search(value) != None:
-                    self.hyperlinks.append(value)
-
-    def get_hyperlinks(self):
-        "Return the list of hyperlinks."
-
-        return self.hyperlinks
-
-class MySubParser(MyParser):
-    def __init__(self, verbose=0):
-        MyParser.__init__(self, verbose)
-        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
-
-if __name__ == "__main__":
-    # Get a file-like object for the 4chan.org w/imgboard
-    base_url = "http://boards.4chan.org/w/"
-    myparser = MyParser()
-    total = 10
-    for i in range(0, total):
-        if i > 0:
-            url = base_url + str(i)
-        else:
-            url = base_url
-
+def get_thread_links(baseurl):
+    myparser = htmlparser.MyParser()
+    t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+    i = 1
+    total = len(t)
+    progress = progressbar.Progress(total)
+
+    for pagenum in t:
+        progress.show_progress(i)
+        url = base_url + pagenum
         tries = 10
         while tries > 0:
             try:
                 f = urllib.urlopen(url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", url, "failed,", tries, "tries left"
         if not f is None:
-            # Read the object
+            # Read the response
             s = f.read()
             f.close()
 
-            # Try and process the page.
-            # The class should have been defined first, remember.
+            # Process the page.
             myparser.parse(s)
-            print "Parsed", url, "-", i + 1, "of", total
         else:
-            "Opening of", url, "did not succeed, trying next one..."
+            "\rOpening of", url, "did not succeed, trying next one..."
+        i += 1
+    return myparser.get_hyperlinks()
 
-    # Get the hyperlinks.
-    t = myparser.get_hyperlinks()
-    mysubparser = MySubParser()
+def get_image_links(baseurl, t = []):
+    mysubparser = htmlparser.MySubParser()
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
+
     for link in t:
+        progress.show_progress(i)
+
         img_url = base_url + link
         tries = 10
         while tries > 0:
@@ -84,35 +52,44 @@ if __name__ == "__main__":
                 f = urllib.urlopen(img_url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", img_url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", img_url, "failed,", tries, "tries left"
         if not f is None:
             s = f.read()
             f.close()
 
             mysubparser.parse(s)
-            print "Parsed", img_url, "-", i, "of", total
         else:
-            print "Opening of", img_url, "did not succeed, trying next one..."
-        i = i + 1
+            print "\rOpening of", img_url, "did not succeed, trying next one..."
+        i += 1
 
-    t = mysubparser.get_hyperlinks()
+    return mysubparser.get_hyperlinks()
 
+def get_images(t = []):
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
     for link in t:
+        progress.show_progress(i)
         filename = os.path.join(savedir, os.path.split(link)[1])
         if not os.path.exists(filename):
             tries = 10
             while tries > 0:
                 try:
                     urllib.urlretrieve(link, filename)
-                    print "Retrieved", link, "-", i, "of", total
                     break
                 except IOError:
-                    tries = tries - 1
-                    print "Downloading of", link, "failed,", tries, "left"
+                    tries -= 1
+                    print "\rDownloading of", link, "failed,", tries, "left"
 
         else:
-            print "Not downloading", link, "already downloaded"
-        i = i + 1
+            print "\rNot downloading", link, "already downloaded"
+        i += 1
+
+if __name__ == "__main__":
+    # Get a file-like object for the 4chan.org w/imgboard
+    base_url = "http://boards.4chan.org/w/"
+
+    # Get the hyperlinks.
+    t = get_thread_links(base_url)
+    t = get_image_links(base_url, t)
+    get_images(t)
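With the work now split into functions, the pipeline can also be driven from another script instead of running download.py directly. A minimal sketch of that (the /tmp/4grab/ save directory is a made-up example; note that, as committed, get_thread_links() and get_image_links() still read the module-level base_url rather than their baseurl argument, so it is set on the module first):

import download

# As committed, the helpers read the module-level base_url rather than
# their baseurl parameter, so set it (and the save directory) on the module.
download.base_url = "http://boards.4chan.org/w/"
download.savedir = "/tmp/4grab/"   # made-up directory for this sketch

threads = download.get_thread_links(download.base_url)
images = download.get_image_links(download.base_url, threads)
download.get_images(images)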
30  htmlparser.py (new file)

@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+    def __init__(self, verbose=0):
+        sgmllib.SGMLParser.__init__(self, verbose)
+
+        self.hyperlinks = []
+        self.url_reg = re.compile('res/\d+\Z')
+        self.prev = ""
+
+    def parse(self, s):
+        self.feed(s)
+        self.close()
+
+    def start_a(self, attributes):
+        for name, value in attributes:
+            if name == "href":
+                if self.url_reg.search(value) != None:
+                    if self.prev != value:
+                        self.hyperlinks.append(value)
+                    self.prev = value
+
+    def get_hyperlinks(self):
+        return self.hyperlinks
+
+class MySubParser(MyParser):
+    def __init__(self, verbose=0):
+        MyParser.__init__(self, verbose)
+        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
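The new parser classes can be exercised on their own, outside the downloader. A small sketch with a made-up HTML snippet (the href values just mimic the board's thread and image links):

import htmlparser

page = '''
<a href="res/1234">thread</a>
<a href="res/1234">same thread again, skipped by the duplicate check</a>
<a href="http://images.example.org/src/1234567890.jpg">image</a>
'''

threads = htmlparser.MyParser()
threads.parse(page)
print threads.get_hyperlinks()    # ['res/1234']

images = htmlparser.MySubParser()
images.parse(page)
print images.get_hyperlinks()     # ['http://images.example.org/src/1234567890.jpg']

MySubParser only swaps in a stricter url_reg, so it picks out the /src/ image links while MyParser keeps collecting the res/ thread links.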
27  progressbar.py (new file)

@@ -0,0 +1,27 @@
+import sys
+import time
+
+class Progress():
+    def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout):
+        self.maxwidth = maxwidth
+        self.maxvalue = maxvalue
+        self.fd = fd
+        self.fill_char = '#'
+
+        self.show_progress(0)
+
+    def show_progress(self, value):
+        str_value = str(value)
+        str_maxvalue = str(self.maxvalue)
+        true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue)
+        progress = int(round((true_maxwidth/float(self.maxvalue))*value))
+        self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress)))
+        self.fd.flush()
+        if value == self.maxvalue:
+            self.fd.write("\n")
+
+if __name__ == "__main__":
+    prog = Progress(200)
+    for i in range(1, 201):
+        prog.show_progress(i)
+        time.sleep(1)
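The Progress class can be reused outside the downloader as well; a small sketch with arbitrary numbers (40 columns, 50 items) that writes the bar to stderr so it does not mix with normal output:

import sys
import time
import progressbar

progress = progressbar.Progress(50, maxwidth=40, fd=sys.stderr)
for i in range(1, 51):
    time.sleep(0.1)            # stand-in for real work per item
    progress.show_progress(i)  # the "\r" prefix redraws the bar in place

Each call rewrites the same terminal line, and show_progress() prints the final newline itself once value reaches maxvalue.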