Port MyParser from sgmllib.SGMLParser to HTMLParser.

Review notes on this revision of the patch:
  * The duplicate-link filter keeps its state in self.prev. A variable
    local to handle_starttag() is reset on every tag, so it can never
    suppress a link repeated in consecutive <a> tags.
  * Valueless href attributes (reported as None by HTMLParser) are skipped
    instead of being passed to the regular expression.
  * The post-image blob id on the "index" line predates these changes.

diff --git a/htmlparser.py b/htmlparser.py
index bbfd620..aa1c885 100644
--- a/htmlparser.py
+++ b/htmlparser.py
@@ -17,33 +17,45 @@
 # along with 4grab. If not, see .
 ######################################################################
 
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
 import re
 
-class MyParser(sgmllib.SGMLParser):
-    def __init__(self, verbose=0):
-        sgmllib.SGMLParser.__init__(self, verbose)
-        
+class MyParser(HTMLParser):
+    """Collect the href of every <a> tag whose value matches url_reg."""
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+
         self.hyperlinks = []
         self.url_reg = re.compile('res/\d+\Z')
         self.prev = ""
-    
+
     def parse(self, s):
         self.feed(s)
         self.close()
 
-    def start_a(self, attributes):
-        for name, value in attributes:
-            if name == "href":
-                if self.url_reg.search(value) != None:
-                    if self.prev != value:
-                        self.hyperlinks.append(value)
-                    self.prev = value
-    
+    def handle_starttag(self, tag, attrs):
+        """Record the href of an <a> tag when it matches url_reg."""
+        if tag != 'a':
+            return
+
+        for name, value in attrs:
+            # HTMLParser reports valueless attributes (<a href>) as None.
+            if name != 'href' or value is None:
+                continue
+            if self.url_reg.search(value) is None:
+                continue
+            # self.prev lives on the instance so that the same link in
+            # consecutive tags is stored once; a local would be reset on
+            # every call and never suppress anything.
+            if self.prev != value:
+                self.hyperlinks.append(value)
+            self.prev = value
+
     def get_hyperlinks(self):
         return self.hyperlinks
 
 class MySubParser(MyParser):
-    def __init__(self, verbose=0):
-        MyParser.__init__(self, verbose)
+    def __init__(self):
+        MyParser.__init__(self)
         self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')