summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar ryuslash2010-01-15 08:22:17 +0100
committerGravatar ryuslash2010-01-15 08:22:17 +0100
commitfb65246575871e0129b80911c3610606884451b0 (patch)
treea7909052e7db1de5cc730f3dad686f26a99ed3b0
download4grab-fb65246575871e0129b80911c3610606884451b0.tar.gz
4grab-fb65246575871e0129b80911c3610606884451b0.zip
Initial commit
Can download images from /w/
-rw-r--r--.gitignore1
-rw-r--r--download.py118
2 files changed, 119 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b25c15b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*~
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..5c77654
--- /dev/null
+++ b/download.py
@@ -0,0 +1,118 @@
+import urllib
+import sgmllib
+import re
+import os
+
+savedir = "/home/slash/Pictures/4grab/"
+
class MyParser(sgmllib.SGMLParser):
    """Collect thread links (hrefs matching ``res/<digits>``) from a
    4chan board index page.

    Feed a page through ``parse()`` and read the collected links back
    with ``get_hyperlinks()``.
    """

    def __init__(self, verbose=0):
        """Initialise an object, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []

        # Raw strings so the backslash escapes (\d, \Z) reach the regex
        # engine untouched instead of depending on Python's lenient
        # handling of unknown string escapes.
        self.url_reg = re.compile(r'res/\d+\Z')
        self.img_reg = re.compile(r'/\d+\.(jpg|gif|bmp|png|jpeg)\Z')

    def parse(self, s):
        """Parse the given string 's' (feed it through the SGML parser)."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Process a hyperlink tag: record hrefs that match url_reg."""
        for name, value in attributes:
            # 'is not None' is the correct identity test for a match object.
            if name == "href" and self.url_reg.search(value) is not None:
                self.hyperlinks.append(value)

    def get_hyperlinks(self):
        """Return the list of collected hyperlinks."""
        return self.hyperlinks
+
class MySubParser(MyParser):
    """Parser variant that collects image links (``/src/<id>.<ext>``)
    from a thread page instead of thread links from the index."""

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        # Raw string so \d, \., \w and \Z reach the regex engine untouched.
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')
+
+if __name__ == "__main__":
+ # Get a file-like object for the 4chan.org w/imgboard
+ base_url = "http://boards.4chan.org/w/"
+ myparser = MyParser()
+ total = 10
+ for i in range(0, total):
+ if i > 0:
+ url = base_url + str(i)
+ else:
+ url = base_url
+
+ tries = 10
+ while tries > 0:
+ try:
+ f = urllib.urlopen(url)
+ break
+ except IOError:
+ tries = tries - 1
+ print "Try of", url, "failed,", tries, "tries left"
+ if not f is None:
+ # Read the object
+ s = f.read()
+ f.close()
+
+ # Try and process the page.
+ # The class should have been defined first, remember.
+ myparser.parse(s)
+ print "Parsed", url, "-", i + 1, "of", total
+ else:
+ "Opening of", url, "did not succeed, trying next one..."
+
+ # Get the hyperlinks.
+ t = myparser.get_hyperlinks()
+ mysubparser = MySubParser()
+ total = len(t)
+ i = 1
+ for link in t:
+ img_url = base_url + link
+ tries = 10
+ while tries > 0:
+ try:
+ f = urllib.urlopen(img_url)
+ break
+ except IOError:
+ tries = tries - 1
+ print "Try of", img_url, "failed,", tries, "tries left"
+ if not f is None:
+ s = f.read()
+ f.close()
+
+ mysubparser.parse(s)
+ print "Parsed", img_url, "-", i, "of", total
+ else:
+ print "Opening of", img_url, "did not succeed, trying next one..."
+ i = i + 1
+
+ t = mysubparser.get_hyperlinks()
+ total = len(t)
+ i = 1
+ for link in t:
+ filename = os.path.join(savedir, os.path.split(link)[1])
+ if not os.path.exists(filename):
+ tries = 10
+ while tries > 0:
+ try:
+ urllib.urlretrieve(link, filename)
+ print "Retrieved", link, "-", i, "of", total
+ break
+ except IOError:
+ tries = tries - 1
+ print "Downloading of", link, "failed,", tries, "left"
+
+ else:
+ print "Not downloading", link, "already downloaded"
+ i = i + 1
+