summary refs log tree commit diff stats
path: root/htmlparser.py
diff options
context:
space:
mode:
author Gravatar ryuslash 2010-01-17 03:48:23 +0100
committer Gravatar ryuslash 2010-01-17 03:48:23 +0100
commit 025a723a5a201009da392bca4c27c4eb25e9e734 (patch)
tree 8a16a78a0326c78ab6cdca270623986bcc369dcd /htmlparser.py
parent fb65246575871e0129b80911c3610606884451b0 (diff)
download 4grab-025a723a5a201009da392bca4c27c4eb25e9e734.tar.gz
4grab-025a723a5a201009da392bca4c27c4eb25e9e734.zip
Parser separation, progress bar
* Separated the parser from the downloader code. * Added a progress bar class, to make it look fancier. * Created some functions to do all the work in downloader.py; it is cleaner now. * Renamed parser.py to htmlparser.py, since it was conflicting with a built-in module.
Diffstat (limited to 'htmlparser.py')
-rw-r--r-- htmlparser.py 30
1 files changed, 30 insertions, 0 deletions
diff --git a/htmlparser.py b/htmlparser.py
new file mode 100644
index 0000000..73338dd
--- /dev/null
+++ b/htmlparser.py
@@ -0,0 +1,30 @@
+import sgmllib
+import re
+
+class MyParser(sgmllib.SGMLParser):
+ def __init__(self, verbose=0):
+ sgmllib.SGMLParser.__init__(self, verbose)
+
+ self.hyperlinks = []
+ self.url_reg = re.compile('res/\d+\Z')
+ self.prev = ""
+
+ def parse(self, s):
+ self.feed(s)
+ self.close()
+
+ def start_a(self, attributes):
+ for name, value in attributes:
+ if name == "href":
+ if self.url_reg.search(value) != None:
+ if self.prev != value:
+ self.hyperlinks.append(value)
+ self.prev = value
+
+ def get_hyperlinks(self):
+ return self.hyperlinks
+
+class MySubParser(MyParser):
+ def __init__(self, verbose=0):
+ MyParser.__init__(self, verbose)
+ self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')