######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab.  If not, see <http://www.gnu.org/licenses/>.
######################################################################

"""HTML parsers that collect the hyperlinks matching a URL pattern."""

import re

try:
    # Python 2 module layout.
    from HTMLParser import HTMLParser, HTMLParseError
except ImportError:
    # Python 3 module layout.
    from html.parser import HTMLParser
    try:
        from html.parser import HTMLParseError
    except ImportError:
        # HTMLParseError no longer exists in recent Python 3 releases;
        # keep the name importable from this module for existing callers.
        class HTMLParseError(Exception):
            """Stand-in for the removed ``HTMLParseError`` exception."""


class MyParser(HTMLParser):
    """Collect the ``href`` values of ``<a>`` tags that match ``url_reg``.

    The default pattern accepts hrefs ending in ``res/<digits>``
    (presumably links to threads -- confirm against the caller).
    Subclasses may replace ``self.url_reg`` after calling ``__init__``
    to collect a different kind of link.
    """

    def __init__(self):
        # Explicit base-class call: the Python 2 HTMLParser is an
        # old-style class, so super() cannot be used here.
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url_reg = re.compile(r'res/\d+\Z')
        # Last link that was accepted.  It lives on the instance because
        # handle_starttag is called once per tag; a local variable would
        # be reset on every call and never filter anything.
        self._last_link = None

    def parse(self, s):
        """Feed the HTML text ``s`` to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an ``<a>`` tag when it matches ``url_reg``.

        A link identical to the one accepted immediately before it is
        skipped, so runs of repeated links are stored only once.
        """
        if tag != 'a':
            return

        for name, value in attrs:
            # A bare ``<a href>`` yields a value of None, which the
            # regex cannot search; ignore it.
            if name != 'href' or value is None:
                continue
            if self.url_reg.search(value) is None:
                continue
            if value != self._last_link:
                self.hyperlinks.append(value)
                self._last_link = value

    def get_hyperlinks(self):
        """Return the list of links collected so far."""
        return self.hyperlinks


class MySubParser(MyParser):
    """Variant of ``MyParser`` for hrefs ending in ``/src/<digits>.<ext>``.

    ``<ext>`` is three or four word characters.
    """

    def __init__(self):
        MyParser.__init__(self)
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')