Parser separation, progress bar
* Separated the parser from the downloader code.
* Added a progress bar class to make the output look fancier.
* Created functions to do all the work in download.py; it is cleaner now.
* Renamed parser.py to htmlparser.py, since it was conflicting with a built-in module.
This commit is contained in:
parent fb65246575
commit 025a723a5a

4 changed files with 109 additions and 74 deletions
1  .gitignore (vendored)

@@ -1 +1,2 @@
 *~
+*.pyc
113  download.py

@@ -1,82 +1,50 @@
 import urllib
-import sgmllib
-import re
 import os
+import htmlparser
+import progressbar
 
 savedir = "/home/slash/Pictures/4grab/"
 
-class MyParser(sgmllib.SGMLParser):
-    "A simple parser class."
-
-    def parse(self, s):
-        "Parse the given string 's'."
-        self.feed(s)
-        self.close()
-
-    def __init__(self, verbose=0):
-        "Initialise an object, passing 'verbose' to the superclass."
-
-        sgmllib.SGMLParser.__init__(self, verbose)
-        self.hyperlinks = []
-
-        self.url_reg = re.compile('res/\d+\Z')
-        self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z')
-
-    def start_a(self, attributes):
-        "Process a hyperlink and its 'attributes'."
-
-        for name, value in attributes:
-            if name == "href":
-                if self.url_reg.search(value) != None:
-                    self.hyperlinks.append(value)
-
-    def get_hyperlinks(self):
-        "Return the list of hyperlinks."
-
-        return self.hyperlinks
-
-class MySubParser(MyParser):
-    def __init__(self, verbose=0):
-        MyParser.__init__(self, verbose)
-        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
-
-if __name__ == "__main__":
-    # Get a file-like object for the 4chan.org w/imgboard
-    base_url = "http://boards.4chan.org/w/"
-    myparser = MyParser()
-    total = 10
-    for i in range(0, total):
-        if i > 0:
-            url = base_url + str(i)
-        else:
-            url = base_url
-
+def get_thread_links(baseurl):
+    myparser = htmlparser.MyParser()
+    t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+    i = 1
+    total = len(t)
+    progress = progressbar.Progress(total)
+
+    for pagenum in t:
+        progress.show_progress(i)
+        url = base_url + pagenum
         tries = 10
         while tries > 0:
             try:
                 f = urllib.urlopen(url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", url, "failed,", tries, "tries left"
         if not f is None:
-            # Read the object
+            # Read the response
             s = f.read()
             f.close()
 
-            # Try and process the page.
-            # The class should have been defined first, remember.
+            # Process the page.
             myparser.parse(s)
-            print "Parsed", url, "-", i + 1, "of", total
         else:
-            "Opening of", url, "did not succeed, trying next one..."
+            "\rOpening of", url, "did not succeed, trying next one..."
+        i += 1
+    return myparser.get_hyperlinks()
 
-    # Get the hyperlinks.
-    t = myparser.get_hyperlinks()
-    mysubparser = MySubParser()
+def get_image_links(baseurl, t = []):
+    mysubparser = htmlparser.MySubParser()
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
+
     for link in t:
+        progress.show_progress(i)
+
         img_url = base_url + link
         tries = 10
         while tries > 0:
@@ -84,35 +52,44 @@ if __name__ == "__main__":
                 f = urllib.urlopen(img_url)
                 break
             except IOError:
-                tries = tries - 1
-                print "Try of", img_url, "failed,", tries, "tries left"
+                tries -= 1
+                print "\rTry of", img_url, "failed,", tries, "tries left"
         if not f is None:
             s = f.read()
             f.close()
 
             mysubparser.parse(s)
-            print "Parsed", img_url, "-", i, "of", total
         else:
-            print "Opening of", img_url, "did not succeed, trying next one..."
-        i = i + 1
+            print "\rOpening of", img_url, "did not succeed, trying next one..."
+        i += 1
 
-    t = mysubparser.get_hyperlinks()
+    return mysubparser.get_hyperlinks()
 
+def get_images(t = []):
     total = len(t)
+    progress = progressbar.Progress(total)
     i = 1
     for link in t:
+        progress.show_progress(i)
         filename = os.path.join(savedir, os.path.split(link)[1])
         if not os.path.exists(filename):
             tries = 10
             while tries > 0:
                 try:
                     urllib.urlretrieve(link, filename)
-                    print "Retrieved", link, "-", i, "of", total
                     break
                 except IOError:
-                    tries = tries - 1
-                    print "Downloading of", link, "failed,", tries, "left"
+                    tries -= 1
+                    print "\rDownloading of", link, "failed,", tries, "left"
 
         else:
-            print "Not downloading", link, "already downloaded"
-        i = i + 1
+            print "\rNot downloading", link, "already downloaded"
+        i += 1
+
+if __name__ == "__main__":
+    # Get a file-like object for the 4chan.org w/imgboard
+    base_url = "http://boards.4chan.org/w/"
+
+    # Get the hyperlinks.
+    t = get_thread_links(base_url)
+    t = get_image_links(base_url, t)
+    get_images(t)
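With the work now split into functions, the pipeline can also be driven from another script instead of running download.py directly. A minimal sketch of that (the /tmp/4grab/ save directory is a made-up example; note that, as committed, get_thread_links() and get_image_links() still read the module-level base_url rather than their baseurl argument, so it is set on the module first):

import download

# As committed, the helpers read the module-level base_url rather than
# their baseurl parameter, so set it (and the save directory) on the module.
download.base_url = "http://boards.4chan.org/w/"
download.savedir = "/tmp/4grab/"   # made-up directory for this sketch

threads = download.get_thread_links(download.base_url)
images = download.get_image_links(download.base_url, threads)
download.get_images(images)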
30  htmlparser.py (new file)

@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+    def __init__(self, verbose=0):
+        sgmllib.SGMLParser.__init__(self, verbose)
+
+        self.hyperlinks = []
+        self.url_reg = re.compile('res/\d+\Z')
+        self.prev = ""
+
+    def parse(self, s):
+        self.feed(s)
+        self.close()
+
+    def start_a(self, attributes):
+        for name, value in attributes:
+            if name == "href":
+                if self.url_reg.search(value) != None:
+                    if self.prev != value:
+                        self.hyperlinks.append(value)
+                    self.prev = value
+
+    def get_hyperlinks(self):
+        return self.hyperlinks
+
+class MySubParser(MyParser):
+    def __init__(self, verbose=0):
+        MyParser.__init__(self, verbose)
+        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
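The new parser classes can be exercised on their own, outside the downloader. A small sketch with a made-up HTML snippet (the href values just mimic the board's thread and image links):

import htmlparser

page = '''
<a href="res/1234">thread</a>
<a href="res/1234">same thread again, skipped by the duplicate check</a>
<a href="http://images.example.org/src/1234567890.jpg">image</a>
'''

threads = htmlparser.MyParser()
threads.parse(page)
print threads.get_hyperlinks()    # ['res/1234']

images = htmlparser.MySubParser()
images.parse(page)
print images.get_hyperlinks()     # ['http://images.example.org/src/1234567890.jpg']

MySubParser only swaps in a stricter url_reg, so it picks out the /src/ image links while MyParser keeps collecting the res/ thread links.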
27  progressbar.py (new file)

@@ -0,0 +1,27 @@
+import sys
+import time
+
+class Progress():
+    def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout):
+        self.maxwidth = maxwidth
+        self.maxvalue = maxvalue
+        self.fd = fd
+        self.fill_char = '#'
+
+        self.show_progress(0)
+
+    def show_progress(self, value):
+        str_value = str(value)
+        str_maxvalue = str(self.maxvalue)
+        true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue)
+        progress = int(round((true_maxwidth/float(self.maxvalue))*value))
+        self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress)))
+        self.fd.flush()
+        if value == self.maxvalue:
+            self.fd.write("\n")
+
+if __name__ == "__main__":
+    prog = Progress(200)
+    for i in range(1, 201):
+        prog.show_progress(i)
+        time.sleep(1)
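The Progress class can be reused outside the downloader as well; a small sketch with arbitrary numbers (40 columns, 50 items) that writes the bar to stderr so it does not mix with normal output:

import sys
import time
import progressbar

progress = progressbar.Progress(50, maxwidth=40, fd=sys.stderr)
for i in range(1, 51):
    time.sleep(0.1)            # stand-in for real work per item
    progress.show_progress(i)  # the "\r" prefix redraws the bar in place

Each call rewrites the same terminal line, and show_progress() prints the final newline itself once value reaches maxvalue.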