summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar ryuslash2010-01-15 08:22:17 +0100
committerGravatar ryuslash2010-01-15 08:22:17 +0100
commitfb65246575871e0129b80911c3610606884451b0 (patch)
treea7909052e7db1de5cc730f3dad686f26a99ed3b0
download4grab-fb65246575871e0129b80911c3610606884451b0.tar.gz
4grab-fb65246575871e0129b80911c3610606884451b0.zip
Initial commit
Can download images from /w/
-rw-r--r--.gitignore1
-rw-r--r--download.py118
2 files changed, 119 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b25c15b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..5c77654
--- /dev/null
+++ b/download.py
@@ -0,0 +1,118 @@
+import urllib
+import sgmllib
+import re
+import os
+
+savedir = "/home/slash/Pictures/4grab/"
+
class MyParser(sgmllib.SGMLParser):
    """Collect thread links (hrefs matching ``res/<digits>``) from a
    4chan board index page.

    Feed a page through ``parse()`` and read the collected links back
    with ``get_hyperlinks()``.
    """

    def __init__(self, verbose=0):
        """Initialise an object, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []

        # Raw strings so the backslash escapes (\d, \Z) reach the regex
        # engine untouched instead of depending on Python's lenient
        # handling of unknown string escapes.
        self.url_reg = re.compile(r'res/\d+\Z')
        self.img_reg = re.compile(r'/\d+\.(jpg|gif|bmp|png|jpeg)\Z')

    def parse(self, s):
        """Parse the given string 's' (feed it through the SGML parser)."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Process a hyperlink tag: record hrefs that match url_reg."""
        for name, value in attributes:
            # 'is not None' is the correct identity test for a match object.
            if name == "href" and self.url_reg.search(value) is not None:
                self.hyperlinks.append(value)

    def get_hyperlinks(self):
        """Return the list of collected hyperlinks."""
        return self.hyperlinks
+
class MySubParser(MyParser):
    """Parser variant that collects image links (``/src/<id>.<ext>``)
    from a thread page instead of thread links from the index."""

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        # Raw string so \d, \., \w and \Z reach the regex engine untouched.
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')
+
+if __name__ == "__main__":
+ # Get a file-like object for the 4chan.org w/imgboard
+ base_url = "http://boards.4chan.org/w/"
+ myparser = MyParser()
+ total = 10
+ for i in range(0, total):
+ if i > 0:
+ url = base_url + str(i)
+ else:
+ url = base_url
+
+ tries = 10
+ while tries > 0:
+ try:
+ f = urllib.urlopen(url)
+ break
+ except IOError:
+ tries = tries - 1
+ print "Try of", url, "failed,", tries, "tries left"
+ if not f is None:
+ # Read the object
+ s = f.read()
+ f.close()
+
+ # Try and process the page.
+ # The class should have been defined first, remember.
+ myparser.parse(s)
+ print "Parsed", url, "-", i + 1, "of", total
+ else:
+ "Opening of", url, "did not succeed, trying next one..."
+
+ # Get the hyperlinks.
+ t = myparser.get_hyperlinks()
+ mysubparser = MySubParser()
+ total = len(t)
+ i = 1
+ for link in t:
+ img_url = base_url + link
+ tries = 10
+ while tries > 0:
+ try:
+ f = urllib.urlopen(img_url)
+ break
+ except IOError:
+ tries = tries - 1
+ print "Try of", img_url, "failed,", tries, "tries left"
+ if not f is None:
+ s = f.read()
+ f.close()
+
+ mysubparser.parse(s)
+ print "Parsed", img_url, "-", i, "of", total
+ else:
+ print "Opening of", img_url, "did not succeed, trying next one..."
+ i = i + 1
+
+ t = mysubparser.get_hyperlinks()
+ total = len(t)
+ i = 1
+ for link in t:
+ filename = os.path.join(savedir, os.path.split(link)[1])
+ if not os.path.exists(filename):
+ tries = 10
+ while tries > 0:
+ try:
+ urllib.urlretrieve(link, filename)
+ print "Retrieved", link, "-", i, "of", total
+ break
+ except IOError:
+ tries = tries - 1
+ print "Downloading of", link, "failed,", tries, "left"
+
+ else:
+ print "Not downloading", link, "already downloaded"
+ i = i + 1
+