diff options
author | ryuslash | 2010-01-17 03:48:23 +0100 |
---|---|---|
committer | ryuslash | 2010-01-17 03:48:23 +0100 |
commit | 025a723a5a201009da392bca4c27c4eb25e9e734 (patch) | |
tree | 8a16a78a0326c78ab6cdca270623986bcc369dcd /htmlparser.py | |
parent | fb65246575871e0129b80911c3610606884451b0 (diff) | |
download | 4grab-025a723a5a201009da392bca4c27c4eb25e9e734.tar.gz 4grab-025a723a5a201009da392bca4c27c4eb25e9e734.zip |
Parser separation, progress bar
* Separated the parser from the downloader code.
* Added a progressbar class, to make it look fancier
* Created some functions to do all the work in downloader.py, cleaner now
* Changed parser.py to htmlparser.py, since it was conflicting with a built-in module
Diffstat (limited to 'htmlparser.py')
-rw-r--r-- | htmlparser.py | 30 |
1 files changed, 30 insertions, 0 deletions
import re
import sgmllib


class MyParser(sgmllib.SGMLParser):
    """Collect the href of each <a> tag whose value matches ``url_reg``.

    The default pattern accepts hrefs that end in ``res/<digits>``.
    Subclasses select different links by replacing ``self.url_reg``
    after calling this class's ``__init__``.
    """

    def __init__(self, verbose=0):
        """Set up the underlying SGML parser and an empty result list.

        verbose is passed straight through to sgmllib.SGMLParser.
        """
        sgmllib.SGMLParser.__init__(self, verbose)

        # Matching hrefs, in the order they were encountered.
        self.hyperlinks = []
        # Raw string: the regex escapes (\d, \Z) must reach ``re`` as
        # written instead of relying on unrecognised string escapes.
        self.url_reg = re.compile(r'res/\d+\Z')
        # Most recently stored href; used to skip an immediate repeat.
        self.prev = ""

    def parse(self, s):
        """Feed the markup string s to the parser, then close it."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Record the href of an opening <a> tag when it matches ``url_reg``.

        attributes is iterated as (name, value) pairs.  An href equal to
        the previously stored one is not appended again; only
        back-to-back repeats are filtered, not duplicates in general.
        """
        for name, value in attributes:
            if name != "href":
                continue
            if self.url_reg.search(value) is None:
                continue
            if value != self.prev:
                self.hyperlinks.append(value)
                # NOTE(review): the original indentation was lost; this
                # assumes prev is updated only when a link is stored.
                self.prev = value

    def get_hyperlinks(self):
        """Return the list of hrefs collected so far."""
        return self.hyperlinks


class MySubParser(MyParser):
    """MyParser variant whose pattern is ``/src/<digits>.<ext>``.

    The extension part is 3 or 4 word characters and must end the href.
    """

    def __init__(self, verbose=0):
        """Initialise as MyParser, then swap in the ``/src/`` pattern."""
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')