diff options
Diffstat (limited to 'htmlparser.py')
-rw-r--r-- | htmlparser.py | 30 |
1 files changed, 30 insertions, 0 deletions
"""Link-collecting HTML parsers (htmlparser.py).

``MyParser`` gathers the ``href`` values of ``<a>`` tags that match a
regular expression; ``MySubParser`` is the same parser with a different
pattern.
"""

import re

try:
    import sgmllib

    _SGMLParserBase = sgmllib.SGMLParser
except ImportError:
    # sgmllib was removed in Python 3.  Fall back to a small adapter over
    # html.parser that offers the part of the SGMLParser protocol this
    # module relies on: ``start_<tag>`` / ``do_<tag>`` / ``end_<tag>`` hooks.
    from html.parser import HTMLParser

    class _SGMLParserBase(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser`` on Python 3."""

        def __init__(self, verbose=0):
            HTMLParser.__init__(self)
            self.verbose = verbose

        def handle_starttag(self, tag, attrs):
            """Forward a start tag to ``start_<tag>`` or ``do_<tag>`` if defined."""
            handler = getattr(self, "start_" + tag, None)
            if handler is None:
                handler = getattr(self, "do_" + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            """Forward an end tag to ``end_<tag>`` if defined."""
            handler = getattr(self, "end_" + tag, None)
            if handler is not None:
                handler()


class MyParser(_SGMLParserBase):
    """Collect ``<a href=...>`` targets whose value matches ``url_reg``.

    The default pattern accepts hrefs ending in ``res/<digits>``.
    Matching links are stored, in document order, in ``hyperlinks``.
    """

    def __init__(self, verbose=0):
        """Create the parser.

        ``verbose`` is passed straight through to the base parser.
        """
        # Explicit base-class call (not super()): sgmllib.SGMLParser is an
        # old-style class on Python 2, where super() does not work.
        _SGMLParserBase.__init__(self, verbose)

        self.hyperlinks = []  # accepted href values, in document order
        self.url_reg = re.compile(r'res/\d+\Z')
        self.prev = ""  # most recently accepted href

    def parse(self, s):
        """Feed the markup ``s`` to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an ``<a>`` start tag.

        ``attributes`` is a sequence of ``(name, value)`` pairs.  An
        ``href`` value matching ``url_reg`` is appended to ``hyperlinks``
        unless it equals the previously accepted one.
        """
        for name, value in attributes:
            # A missing or empty value can never match the pattern; skipping
            # it also avoids handing None to the regex.
            if name != "href" or not value:
                continue
            if self.url_reg.search(value) is None:
                continue
            # NOTE(review): the original indentation was lost in the diff
            # paste; ``prev`` is assumed to be updated only when a link is
            # recorded -- confirm against the repository.
            if self.prev != value:
                self.hyperlinks.append(value)
                self.prev = value

    def get_hyperlinks(self):
        """Return the list of collected hrefs (the live list, not a copy)."""
        return self.hyperlinks


class MySubParser(MyParser):
    """``MyParser`` variant that accepts hrefs ending in
    ``/src/<digits>.<3-4 word characters>``.
    """

    def __init__(self, verbose=0):
        """Create the parser and replace the default link pattern."""
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')