"""Markup parsers that collect ``href`` values of ``<a>`` tags matching a URL pattern."""

import re

try:
    # Python 2: the original SGML-based parser.
    import sgmllib

    _BaseParser = sgmllib.SGMLParser
except ImportError:
    # Python 3 removed sgmllib; emulate the small part of its interface
    # that the parsers below rely on, on top of html.parser.
    from html.parser import HTMLParser

    class _BaseParser(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser``.

        Only start-tag dispatch to ``start_<tag>`` / ``do_<tag>`` methods
        is emulated; end-tag and data handlers are left at the
        ``HTMLParser`` defaults.
        """

        def __init__(self, verbose=0):
            HTMLParser.__init__(self)
            # Stored only to keep the SGMLParser-style constructor signature.
            self.verbose = verbose

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is None:
                handler = getattr(self, "do_" + tag, None)
            if handler is not None:
                handler(attrs)


class MyParser(_BaseParser):
    """Collect ``<a href>`` values that end in ``res/<digits>``.

    A matching value is skipped when it equals the previously recorded
    match, so back-to-back repeats are stored once; the same value may
    still appear again later, after a different match.
    """

    def __init__(self, verbose=0):
        # Explicit base call (not super()): sgmllib.SGMLParser is an
        # old-style class on Python 2.
        _BaseParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url_reg = re.compile(r'res/\d+\Z')
        # Last value appended to self.hyperlinks ("" before any match).
        self.prev = ""

    def parse(self, s):
        """Feed the markup string *s* to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an ``<a>`` start tag.

        *attributes* is a list of ``(name, value)`` pairs. Every ``href``
        whose value matches ``self.url_reg`` is recorded, unless it
        repeats the previously recorded value.
        """
        for name, value in attributes:
            if name != "href":
                continue
            # html.parser reports a valueless attribute as None, which
            # re cannot search; such an href cannot match anyway.
            if value is None:
                continue
            if self.url_reg.search(value) is None:
                continue
            if self.prev != value:
                self.hyperlinks.append(value)
                self.prev = value

    def get_hyperlinks(self):
        """Return the list of recorded href values, in document order."""
        return self.hyperlinks


class MySubParser(MyParser):
    """Variant of ``MyParser`` matching ``/src/<digits>.<3-4 word chars>``."""

    def __init__(self, verbose=0):
        MyParser.__init__(self, verbose)
        # Same collection logic as the parent; only the pattern differs.
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')