"""Link-extracting HTML parsers.

The module is called ``htmlparser`` (it used to be ``parser.py``) because
the old name conflicted with a built-in module.
"""
import re

try:
    # Python 2: the original base class.
    import sgmllib
    _BaseParser = sgmllib.SGMLParser
except ImportError:
    # sgmllib is not available (it was removed in Python 3); emulate the
    # small part of its interface used here on top of html.parser.
    from html.parser import HTMLParser

    class _BaseParser(HTMLParser):
        """Minimal stand-in that dispatches start tags to ``start_<tag>``."""

        def __init__(self, verbose=0):
            HTMLParser.__init__(self)
            self.verbose = verbose

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)


class MyParser(_BaseParser):
    """Collect the ``href`` of every ``<a>`` tag whose value matches ``url_reg``.

    Matching values are stored, in document order, in ``hyperlinks``.
    A value identical to the previously stored one is skipped, so only
    *consecutive* duplicates are removed.
    """

    def __init__(self, verbose=0):
        _BaseParser.__init__(self, verbose)

        self.hyperlinks = []
        # Raw string: keeps \d and \Z as regex escapes, not string escapes.
        self.url_reg = re.compile(r'res/\d+\Z')
        self.prev = ""

    def parse(self, s):
        """Feed the markup ``s`` to the parser and flush any buffered data."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an ``<a>`` start tag given its ``(name, value)`` attributes."""
        for name, value in attributes:
            if name != "href":
                continue
            if self.url_reg.search(value) is None:
                continue
            # Only record the link when it differs from the last one stored.
            if self.prev != value:
                self.hyperlinks.append(value)
                self.prev = value

    def get_hyperlinks(self):
        """Return the list of links collected so far."""
        return self.hyperlinks


class MySubParser(MyParser):
    """Variant of ``MyParser`` that matches ``/src/<digits>.<ext>`` links.

    ``<ext>`` is three or four word characters and must end the value.
    """

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')