author     ryuslash   2010-01-17 03:48:23 +0100
committer  ryuslash   2010-01-17 03:48:23 +0100
commit     025a723a5a201009da392bca4c27c4eb25e9e734 (patch)
tree       8a16a78a0326c78ab6cdca270623986bcc369dcd
parent     fb65246575871e0129b80911c3610606884451b0 (diff)
download   4grab-025a723a5a201009da392bca4c27c4eb25e9e734.tar.gz
           4grab-025a723a5a201009da392bca4c27c4eb25e9e734.zip
Parser separation, progress bar
* Separated the parser from the downloader code (a short sketch of the new flow follows this list).
* Added a progressbar class to make the output look fancier.
* Created functions in download.py that do all the work; it is cleaner now.
* Renamed parser.py to htmlparser.py, since the old name conflicted with a built-in module.
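For reference, download.py now drives the loops, htmlparser collects the links, and progressbar draws the bar. A minimal sketch of that flow, assuming the two new modules are importable; the page count here is illustrative (the real code walks pages "0" through "10"):

import urllib

import htmlparser
import progressbar

base_url = "http://boards.4chan.org/w/"
pages = 10  # illustrative page count for this sketch

parser = htmlparser.MyParser()          # collects the res/<id> thread links
progress = progressbar.Progress(pages)  # prints a "n/total [####  ]" bar

for pagenum in range(pages):
    progress.show_progress(pagenum + 1)
    page = urllib.urlopen(base_url + str(pagenum)).read()
    parser.parse(page)

thread_links = parser.get_hyperlinks()  # handed on to MySubParser / get_images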
-rw-r--r--  .gitignore      |   1
-rw-r--r--  download.py     | 125
-rw-r--r--  htmlparser.py   |  30
-rw-r--r--  progressbar.py  |  27
4 files changed, 109 insertions, 74 deletions
diff --git a/.gitignore b/.gitignore
index b25c15b..2f836aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
*~
+*.pyc
diff --git a/download.py b/download.py
index 5c77654..f23d64c 100644
--- a/download.py
+++ b/download.py
@@ -1,82 +1,50 @@
import urllib
-import sgmllib
-import re
import os
+import htmlparser
+import progressbar
savedir = "/home/slash/Pictures/4grab/"
-
-class MyParser(sgmllib.SGMLParser):
- "A simple parser class."
-
- def parse(self, s):
- "Parse the given string 's'."
- self.feed(s)
- self.close()
-
- def __init__(self, verbose=0):
- "Initialise an object, passing 'verbose' to the superclass."
-
- sgmllib.SGMLParser.__init__(self, verbose)
- self.hyperlinks = []
-
- self.url_reg = re.compile('res/\d+\Z')
- self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z')
-
- def start_a(self, attributes):
- "Process a hyperlink and its 'attributes'."
-
- for name, value in attributes:
- if name == "href":
- if self.url_reg.search(value) != None:
- self.hyperlinks.append(value)
-
- def get_hyperlinks(self):
- "Return the list of hyperlinks."
-
- return self.hyperlinks
-
-class MySubParser(MyParser):
- def __init__(self, verbose=0):
- MyParser.__init__(self, verbose)
- self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
-if __name__ == "__main__":
- # Get a file-like object for the 4chan.org w/imgboard
- base_url = "http://boards.4chan.org/w/"
- myparser = MyParser()
- total = 10
- for i in range(0, total):
- if i > 0:
- url = base_url + str(i)
- else:
- url = base_url
-
+def get_thread_links(baseurl):
+ myparser = htmlparser.MyParser()
+ t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+ i = 1
+ total = len(t)
+ progress = progressbar.Progress(total)
+
+ for pagenum in t:
+ progress.show_progress(i)
+
+ url = baseurl + pagenum
tries = 10
while tries > 0:
- try:
+ try:
f = urllib.urlopen(url)
break
except IOError:
- tries = tries - 1
- print "Try of", url, "failed,", tries, "tries left"
+ tries -= 1
+ print "\rTry of", url, "failed,", tries, "tries left"
if not f is None:
- # Read the object
+ # Read the response
s = f.read()
f.close()
-
- # Try and process the page.
- # The class should have been defined first, remember.
+
+ # Process the page.
myparser.parse(s)
- print "Parsed", url, "-", i + 1, "of", total
else:
- "Opening of", url, "did not succeed, trying next one..."
-
- # Get the hyperlinks.
- t = myparser.get_hyperlinks()
- mysubparser = MySubParser()
+ "\rOpening of", url, "did not succeed, trying next one..."
+ i += 1
+ return myparser.get_hyperlinks()
+
+def get_image_links(baseurl, t = []):
+ mysubparser = htmlparser.MySubParser()
total = len(t)
+ progress = progressbar.Progress(total)
i = 1
+
for link in t:
+ progress.show_progress(i)
+
+ img_url = baseurl + link
tries = 10
while tries > 0:
@@ -84,35 +52,44 @@ if __name__ == "__main__":
f = urllib.urlopen(img_url)
break
except IOError:
- tries = tries - 1
- print "Try of", img_url, "failed,", tries, "tries left"
+ tries -= 1
+ print "\rTry of", img_url, "failed,", tries, "tries left"
if not f is None:
s = f.read()
f.close()
mysubparser.parse(s)
- print "Parsed", img_url, "-", i, "of", total
else:
- print "Opening of", img_url, "did not succeed, trying next one..."
- i = i + 1
+ print "\rOpening of", img_url, "did not succeed, trying next one..."
+ i += 1
- t = mysubparser.get_hyperlinks()
+ return mysubparser.get_hyperlinks()
+
+def get_images(t = []):
total = len(t)
+ progress = progressbar.Progress(total)
i = 1
for link in t:
+ progress.show_progress(i)
filename = os.path.join(savedir, os.path.split(link)[1])
if not os.path.exists(filename):
tries = 10
while tries > 0:
try:
urllib.urlretrieve(link, filename)
- print "Retrieved", link, "-", i, "of", total
break
except IOError:
- tries = tries - 1
- print "Downloading of", link, "failed,", tries, "left"
-
+ tries -= 1
+ print "\rDownloading of", link, "failed,", tries, "left"
else:
- print "Not downloading", link, "already downloaded"
- i = i + 1
-
+ print "\rNot downloading", link, "already downloaded"
+ i += 1
+
+if __name__ == "__main__":
+ # Get a file-like object for the 4chan.org w/imgboard
+ base_url = "http://boards.4chan.org/w/"
+
+ # Get the hyperlinks.
+ t = get_thread_links(base_url)
+ t = get_image_links(base_url, t)
+ get_images(t)
diff --git a/htmlparser.py b/htmlparser.py
new file mode 100644
index 0000000..73338dd
--- /dev/null
+++ b/htmlparser.py
@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+ def __init__(self, verbose=0):
+ sgmllib.SGMLParser.__init__(self, verbose)
+
+ self.hyperlinks = []
+ self.url_reg = re.compile('res/\d+\Z')
+ self.prev = ""
+
+ def parse(self, s):
+ self.feed(s)
+ self.close()
+
+ def start_a(self, attributes):
+ for name, value in attributes:
+ if name == "href":
+ if self.url_reg.search(value) != None:
+ if self.prev != value:
+ self.hyperlinks.append(value)
+ self.prev = value
+
+ def get_hyperlinks(self):
+ return self.hyperlinks
+
+class MySubParser(MyParser):
+ def __init__(self, verbose=0):
+ MyParser.__init__(self, verbose)
+ self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
diff --git a/progressbar.py b/progressbar.py
new file mode 100644
index 0000000..a2ea711
--- /dev/null
+++ b/progressbar.py
@@ -0,0 +1,27 @@
+import sys
+import time
+
+class Progress():
+ def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout):
+ self.maxwidth = maxwidth
+ self.maxvalue = maxvalue
+ self.fd = fd
+ self.fill_char = '#'
+
+ self.show_progress(0)
+
+ def show_progress(self, value):
+ str_value = str(value)
+ str_maxvalue = str(self.maxvalue)
+ true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue)
+ progress = int(round((true_maxwidth/float(self.maxvalue))*value))
+ self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress)))
+ self.fd.flush()
+ if value == self.maxvalue:
+ self.fd.write("\n")
+
+if __name__ == "__main__":
+ prog = Progress(200)
+ for i in range(1, 201):
+ prog.show_progress(i)
+ time.sleep(1)