Parser separation, progress bar

* Separated the parser from the downloader code.
* Added a progressbar class to make the output look fancier.
* Created functions to do all the work in downloader.py, which is cleaner now.
* Renamed parser.py to htmlparser.py, since it conflicted with a built-in module.
ryuslash 2010-01-17 03:48:23 +01:00
parent fb65246575
commit 025a723a5a
4 changed files with 109 additions and 74 deletions

.gitignore (vendored): 1 addition

@@ -1 +1,2 @@
 *~
+*.pyc

downloader.py

@@ -1,82 +1,50 @@
 import urllib
-import sgmllib
-import re
 import os
+import htmlparser
+import progressbar
 
 savedir = "/home/slash/Pictures/4grab/"
 
-class MyParser(sgmllib.SGMLParser):
-    "A simple parser class."
+def get_thread_links(baseurl):
+    myparser = htmlparser.MyParser()
+    t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+    i = 1
+    total = len(t)
+    progress = progressbar.Progress(total)
-    def parse(self, s):
-        "Parse the given string 's'."
-        self.feed(s)
-        self.close()
-    def __init__(self, verbose=0):
-        "Initialise an object, passing 'verbose' to the superclass."
-        sgmllib.SGMLParser.__init__(self, verbose)
-        self.hyperlinks = []
-        self.url_reg = re.compile('res/\d+\Z')
-        self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z')
-    def start_a(self, attributes):
-        "Process a hyperlink and its 'attributes'."
-        for name, value in attributes:
-            if name == "href":
-                if self.url_reg.search(value) != None:
-                    self.hyperlinks.append(value)
-    def get_hyperlinks(self):
-        "Return the list of hyperlinks."
-        return self.hyperlinks
-class MySubParser(MyParser):
-    def __init__(self, verbose=0):
-        MyParser.__init__(self, verbose)
-        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
-if __name__ == "__main__":
-    # Get a file-like object for the 4chan.org w/imgboard
-    base_url = "http://boards.4chan.org/w/"
-    myparser = MyParser()
-    total = 10
-    for i in range(0, total):
-        if i > 0:
-            url = base_url + str(i)
-        else:
-            url = base_url
+    for pagenum in t:
+        progress.show_progress(i)
+        url = base_url + pagenum
         tries = 10
         while tries > 0:
             try:
                 f = urllib.urlopen(url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", url, "failed,", tries, "tries left"
         if not f is None:
-            # Read the object
+            # Read the response
            s = f.read()
            f.close()
-            # Try and process the page.
-            # The class should have been defined first, remember.
+            # Process the page.
             myparser.parse(s)
-            print "Parsed", url, "-", i + 1, "of", total
         else:
-            "Opening of", url, "did not succeed, trying next one..."
+            "\rOpening of", url, "did not succeed, trying next one..."
+        i += 1
+    return myparser.get_hyperlinks()
-    # Get the hyperlinks.
-    t = myparser.get_hyperlinks()
-    mysubparser = MySubParser()
+def get_image_links(baseurl, t = []):
+    mysubparser = htmlparser.MySubParser()
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
     for link in t:
+        progress.show_progress(i)
         img_url = base_url + link
         tries = 10
         while tries > 0:
@@ -84,35 +52,44 @@ if __name__ == "__main__":
                 f = urllib.urlopen(img_url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", img_url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", img_url, "failed,", tries, "tries left"
         if not f is None:
             s = f.read()
             f.close()
             mysubparser.parse(s)
-            print "Parsed", img_url, "-", i, "of", total
         else:
-            print "Opening of", img_url, "did not succeed, trying next one..."
-        i = i + 1
+            print "\rOpening of", img_url, "did not succeed, trying next one..."
+        i += 1
-    t = mysubparser.get_hyperlinks()
+    return mysubparser.get_hyperlinks()
+
+def get_images(t = []):
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
     for link in t:
+        progress.show_progress(i)
         filename = os.path.join(savedir, os.path.split(link)[1])
         if not os.path.exists(filename):
             tries = 10
             while tries > 0:
                 try:
                     urllib.urlretrieve(link, filename)
-                    print "Retrieved", link, "-", i, "of", total
                     break
                 except IOError:
-                    tries = tries - 1
-                    print "Downloading of", link, "failed,", tries, "left"
+                    tries -= 1
+                    print "\rDownloading of", link, "failed,", tries, "left"
         else:
-            print "Not downloading", link, "already downloaded"
-        i = i + 1
+            print "\rNot downloading", link, "already downloaded"
+        i += 1
+
+if __name__ == "__main__":
+    # Get a file-like object for the 4chan.org w/imgboard
+    base_url = "http://boards.4chan.org/w/"
+    # Get the hyperlinks.
+    t = get_thread_links(base_url)
+    t = get_image_links(base_url, t)
+    get_images(t)
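
A note on the retry loops kept in downloader.py: f is only ever assigned inside the try block, so if all ten attempts for the very first page fail, the following "if not f is None" check raises a NameError, and on later pages it would quietly reuse the previous, already-closed handle; the page-level else branch is also a bare string expression, so it prints nothing. A minimal defensive variant, as a sketch only (the helper name open_with_retries is made up here, not part of this commit), in the same Python 2 style:

    import urllib

    def open_with_retries(url, tries=10):
        # Hypothetical helper, not in the commit: retry urlopen and
        # return None instead of leaving the name unbound on failure.
        f = None
        while tries > 0:
            try:
                f = urllib.urlopen(url)
                break
            except IOError:
                tries -= 1
                print "\rTry of", url, "failed,", tries, "tries left"
        return f

    if __name__ == "__main__":
        f = open_with_retries("http://boards.4chan.org/w/")
        if f is not None:
            print "Read", len(f.read()), "bytes"
            f.close()
        else:
            print "Opening did not succeed"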

htmlparser.py (new file): 30 additions

@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+    def __init__(self, verbose=0):
+        sgmllib.SGMLParser.__init__(self, verbose)
+        self.hyperlinks = []
+        self.url_reg = re.compile('res/\d+\Z')
+        self.prev = ""
+
+    def parse(self, s):
+        self.feed(s)
+        self.close()
+
+    def start_a(self, attributes):
+        for name, value in attributes:
+            if name == "href":
+                if self.url_reg.search(value) != None:
+                    if self.prev != value:
+                        self.hyperlinks.append(value)
+                        self.prev = value
+
+    def get_hyperlinks(self):
+        return self.hyperlinks
+
+class MySubParser(MyParser):
+    def __init__(self, verbose=0):
+        MyParser.__init__(self, verbose)
+        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
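
For a feel of how the split-out classes behave, here is a small usage sketch (Python 2, since sgmllib no longer exists in Python 3; the HTML snippets and the img.example.org host are made up for illustration):

    import htmlparser

    # Thread links on an index page: MyParser matches href values ending in
    # res/<digits> and, via self.prev, skips immediately repeated links.
    p = htmlparser.MyParser()
    p.parse('<a href="res/123456">reply</a> <a href="res/123456">reply</a>')
    print p.get_hyperlinks()    # ['res/123456']

    # Image links inside a thread: MySubParser matches .../src/<digits>.<ext> instead.
    sp = htmlparser.MySubParser()
    sp.parse('<a href="http://img.example.org/src/123456789.jpg">pic</a>')
    print sp.get_hyperlinks()   # ['http://img.example.org/src/123456789.jpg']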

progressbar.py (new file): 27 additions

@@ -0,0 +1,27 @@
+import sys
+import time
+
+class Progress():
+    def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout):
+        self.maxwidth = maxwidth
+        self.maxvalue = maxvalue
+        self.fd = fd
+        self.fill_char = '#'
+        self.show_progress(0)
+
+    def show_progress(self, value):
+        str_value = str(value)
+        str_maxvalue = str(self.maxvalue)
+        true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue)
+        progress = int(round((true_maxwidth/float(self.maxvalue))*value))
+        self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress)))
+        self.fd.flush()
+        if value == self.maxvalue:
+            self.fd.write("\n")
+
+if __name__ == "__main__":
+    prog = Progress(200)
+    for i in range(1, 201):
+        prog.show_progress(i)
+        time.sleep(1)
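
To make the bar arithmetic in show_progress concrete: with the default maxwidth of 80 and maxvalue=11 (the eleven index pages downloader.py walks), true_maxwidth is 80 - 4 - len("6") - len("11") = 73 for value 6, and the fill is round(73 / 11.0 * 6) = 40 '#' characters. A quick non-interactive check against an in-memory stream (Python 2's StringIO, no terminal needed):

    import StringIO
    import progressbar

    out = StringIO.StringIO()
    prog = progressbar.Progress(11, maxwidth=80, fd=out)   # also draws the empty 0/11 bar
    prog.show_progress(6)

    line = out.getvalue().split("\r")[-1]   # text after the last carriage return
    print line                              # 6/11 [########## ... ]
    print line.count("#")                   # 40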