author     ryuslash   2010-01-17 03:48:23 +0100
committer  ryuslash   2010-01-17 03:48:23 +0100
commit     025a723a5a201009da392bca4c27c4eb25e9e734 (patch)
tree       8a16a78a0326c78ab6cdca270623986bcc369dcd
parent     fb65246575871e0129b80911c3610606884451b0 (diff)
download   4grab-025a723a5a201009da392bca4c27c4eb25e9e734.tar.gz
           4grab-025a723a5a201009da392bca4c27c4eb25e9e734.zip
Parser separation, progress bar
* Separated the parser from the downloader code (a short sketch of the new flow follows this list).
* Added a progressbar class to make the output look fancier.
* Created functions in download.py that do all the work; it is cleaner now.
* Renamed parser.py to htmlparser.py, since the old name conflicted with a built-in module.
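For reference, download.py now drives the loops, htmlparser collects the links, and progressbar draws the bar. A minimal sketch of that flow, assuming the two new modules are importable; the page count here is illustrative (the real code walks pages "0" through "10"):

import urllib

import htmlparser
import progressbar

base_url = "http://boards.4chan.org/w/"
pages = 10  # illustrative page count for this sketch

parser = htmlparser.MyParser()          # collects the res/<id> thread links
progress = progressbar.Progress(pages)  # prints a "n/total [####  ]" bar

for pagenum in range(pages):
    progress.show_progress(pagenum + 1)
    page = urllib.urlopen(base_url + str(pagenum)).read()
    parser.parse(page)

thread_links = parser.get_hyperlinks()  # handed on to MySubParser / get_images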
-rw-r--r--  .gitignore      |   1
-rw-r--r--  download.py     | 125
-rw-r--r--  htmlparser.py   |  30
-rw-r--r--  progressbar.py  |  27
4 files changed, 109 insertions, 74 deletions
diff --git a/.gitignore b/.gitignore
index b25c15b..2f836aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
*~
+*.pyc
diff --git a/download.py b/download.py
index 5c77654..f23d64c 100644
--- a/download.py
+++ b/download.py
@@ -1,82 +1,50 @@
import urllib
-import sgmllib
-import re
import os
+import htmlparser
+import progressbar
savedir = "/home/slash/Pictures/4grab/"
-
-class MyParser(sgmllib.SGMLParser):
- "A simple parser class."
-
- def parse(self, s):
- "Parse the given string 's'."
- self.feed(s)
- self.close()
-
- def __init__(self, verbose=0):
- "Initialise an object, passing 'verbose' to the superclass."
-
- sgmllib.SGMLParser.__init__(self, verbose)
- self.hyperlinks = []
-
- self.url_reg = re.compile('res/\d+\Z')
- self.img_reg = re.compile('/\d+\.(jpg|gif|bmp|png|jpeg)\Z')
-
- def start_a(self, attributes):
- "Process a hyperlink and its 'attributes'."
-
- for name, value in attributes:
- if name == "href":
- if self.url_reg.search(value) != None:
- self.hyperlinks.append(value)
-
- def get_hyperlinks(self):
- "Return the list of hyperlinks."
-
- return self.hyperlinks
-
-class MySubParser(MyParser):
- def __init__(self, verbose=0):
- MyParser.__init__(self, verbose)
- self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
-if __name__ == "__main__":
- # Get a file-like object for the 4chan.org w/imgboard
- base_url = "http://boards.4chan.org/w/"
- myparser = MyParser()
- total = 10
- for i in range(0, total):
- if i > 0:
- url = base_url + str(i)
- else:
- url = base_url
-
+def get_thread_links(baseurl):
+ myparser = htmlparser.MyParser()
+ t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+ i = 1
+ total = len(t)
+ progress = progressbar.Progress(total)
+
+ for pagenum in t:
+ progress.show_progress(i)
+
+ url = baseurl + pagenum
tries = 10
while tries > 0:
- try:
+ try:
f = urllib.urlopen(url)
break
except IOError:
- tries = tries - 1
- print "Try of", url, "failed,", tries, "tries left"
+ tries -= 1
+ print "\rTry of", url, "failed,", tries, "tries left"
if not f is None:
- # Read the object
+ # Read the response
s = f.read()
f.close()
-
- # Try and process the page.
- # The class should have been defined first, remember.
+
+ # Process the page.
myparser.parse(s)
- print "Parsed", url, "-", i + 1, "of", total
else:
- "Opening of", url, "did not succeed, trying next one..."
-
- # Get the hyperlinks.
- t = myparser.get_hyperlinks()
- mysubparser = MySubParser()
+ "\rOpening of", url, "did not succeed, trying next one..."
+ i += 1
+ return myparser.get_hyperlinks()
+
+def get_image_links(baseurl, t = []):
+ mysubparser = htmlparser.MySubParser()
total = len(t)
+ progress = progressbar.Progress(total)
i = 1
+
for link in t:
+ progress.show_progress(i)
+
+ img_url = baseurl + link
tries = 10
while tries > 0:
@@ -84,35 +52,44 @@ if __name__ == "__main__":
f = urllib.urlopen(img_url)
break
except IOError:
- tries = tries - 1
- print "Try of", img_url, "failed,", tries, "tries left"
+ tries -= 1
+ print "\rTry of", img_url, "failed,", tries, "tries left"
if not f is None:
s = f.read()
f.close()
mysubparser.parse(s)
- print "Parsed", img_url, "-", i, "of", total
else:
- print "Opening of", img_url, "did not succeed, trying next one..."
- i = i + 1
+ print "\rOpening of", img_url, "did not succeed, trying next one..."
+ i += 1
- t = mysubparser.get_hyperlinks()
+ return mysubparser.get_hyperlinks()
+
+def get_images(t = []):
total = len(t)
+ progress = progressbar.Progress(total)
i = 1
for link in t:
+ progress.show_progress(i)
filename = os.path.join(savedir, os.path.split(link)[1])
if not os.path.exists(filename):
tries = 10
while tries > 0:
try:
urllib.urlretrieve(link, filename)
- print "Retrieved", link, "-", i, "of", total
break
except IOError:
- tries = tries - 1
- print "Downloading of", link, "failed,", tries, "left"
-
+ tries -= 1
+ print "\rDownloading of", link, "failed,", tries, "left"
else:
- print "Not downloading", link, "already downloaded"
- i = i + 1
-
+ print "\rNot downloading", link, "already downloaded"
+ i += 1
+
+if __name__ == "__main__":
+ # Get a file-like object for the 4chan.org w/imgboard
+ base_url = "http://boards.4chan.org/w/"
+
+ # Get the hyperlinks.
+ t = get_thread_links(base_url)
+ t = get_image_links(base_url, t)
+ get_images(t)
diff --git a/htmlparser.py b/htmlparser.py
new file mode 100644
index 0000000..73338dd
--- /dev/null
+++ b/htmlparser.py
@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+ def __init__(self, verbose=0):
+ sgmllib.SGMLParser.__init__(self, verbose)
+
+ self.hyperlinks = []
+ self.url_reg = re.compile('res/\d+\Z')
+ self.prev = ""
+
+ def parse(self, s):
+ self.feed(s)
+ self.close()
+
+ def start_a(self, attributes):
+ for name, value in attributes:
+ if name == "href":
+ if self.url_reg.search(value) != None:
+ if self.prev != value:
+ self.hyperlinks.append(value)
+ self.prev = value
+
+ def get_hyperlinks(self):
+ return self.hyperlinks
+
+class MySubParser(MyParser):
+ def __init__(self, verbose=0):
+ MyParser.__init__(self, verbose)
+ self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
diff --git a/progressbar.py b/progressbar.py
new file mode 100644
index 0000000..a2ea711
--- /dev/null
+++ b/progressbar.py
@@ -0,0 +1,27 @@
+import sys
+import time
+
+class Progress():
+ def __init__(self, maxvalue, maxwidth=80, fd=sys.stdout):
+ self.maxwidth = maxwidth
+ self.maxvalue = maxvalue
+ self.fd = fd
+ self.fill_char = '#'
+
+ self.show_progress(0)
+
+ def show_progress(self, value):
+ str_value = str(value)
+ str_maxvalue = str(self.maxvalue)
+ true_maxwidth = self.maxwidth - 4 - len(str_value) - len(str_maxvalue)
+ progress = int(round((true_maxwidth/float(self.maxvalue))*value))
+ self.fd.write("\r%s/%s [%s%s]" % (str_value, str_maxvalue, self.fill_char * progress, " " * (true_maxwidth - progress)))
+ self.fd.flush()
+ if value == self.maxvalue:
+ self.fd.write("\n")
+
+if __name__ == "__main__":
+ prog = Progress(200)
+ for i in range(1, 201):
+ prog.show_progress(i)
+ time.sleep(1)