######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################
"""SGML-based parsers that collect matching ``href`` values from <a> tags."""

import sgmllib
import re


class MyParser(sgmllib.SGMLParser):
    """Collect ``href`` values of ``<a>`` tags that match ``self.url_reg``.

    The default pattern accepts hrefs ending in ``res/<digits>``
    (presumably links to thread pages -- confirm against the caller).

    Attributes:
        hyperlinks: matching hrefs, in document order.
        url_reg: compiled pattern an href must match to be collected.
        prev: the most recently matched href; used to drop a match that
            immediately repeats the previous one.
    """

    def __init__(self, verbose=0):
        """Set up the underlying SGML parser and empty result state.

        ``verbose`` is passed straight through to ``SGMLParser.__init__``.
        """
        # Explicit base-class call (not super()): this module targets
        # Python 2, where sgmllib lives.
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        # Raw string so the regex escapes are not treated as string escapes.
        self.url_reg = re.compile(r'res/\d+\Z')
        self.prev = ""

    def parse(self, s):
        """Feed the markup string ``s`` to the parser, then close it."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an opening ``<a>`` tag.

        ``attributes`` is iterated as ``(name, value)`` pairs.  Every
        ``href`` whose value matches ``self.url_reg`` is appended to
        ``self.hyperlinks`` unless it equals the previously matched href.
        """
        for name, value in attributes:
            if name != "href":
                continue
            if self.url_reg.search(value) is None:
                continue
            # Only back-to-back repeats are skipped; the same href may
            # still be recorded again after a different match.
            if value != self.prev:
                self.hyperlinks.append(value)
            self.prev = value

    def get_hyperlinks(self):
        """Return the list of collected hrefs (the internal list, not a copy)."""
        return self.hyperlinks


class MySubParser(MyParser):
    """Variant of ``MyParser`` with a different href pattern.

    Accepts hrefs ending in ``/src/<digits>.<ext>`` where ``<ext>`` is
    three or four word characters (presumably links to image files --
    confirm against the caller).
    """

    def __init__(self, verbose=0):
        """Initialise like ``MyParser``, then override ``url_reg``."""
        MyParser.__init__(self, verbose)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')