diff --git a/htmlparser.py b/htmlparser.py
index bbfd620..aa1c885 100644
--- a/htmlparser.py
+++ b/htmlparser.py
@@ -17,33 +17,35 @@
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
import re
-class MyParser(sgmllib.SGMLParser):
-    def __init__(self, verbose=0):
-        sgmllib.SGMLParser.__init__(self, verbose)
-
+class MyParser(HTMLParser):
+    """Collect the href of every <a> tag whose value matches url_reg.
+
+    Matching hrefs are appended to self.hyperlinks in the order they are
+    parsed; a href equal to the previously matched one is stored only once.
+    """
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+
         self.hyperlinks = []
-        self.url_reg = re.compile('res/\d+\Z')
+        # Raw string: \d and \Z must reach the regex engine untouched.
+        self.url_reg = re.compile(r'res/\d+\Z')
+        # Last matching href seen.  Kept on the instance because
+        # handle_starttag() runs once per tag, so a local variable would be
+        # reset before it could ever be compared with the next link.
         self.prev = ""
-
+
     def parse(self, s):
+        """Feed the markup string s to the parser, then close it."""
         self.feed(s)
         self.close()
-    def start_a(self, attributes):
-        for name, value in attributes:
-            if name == "href":
-                if self.url_reg.search(value) != None:
-                    if self.prev != value:
-                        self.hyperlinks.append(value)
-                    self.prev = value
-
+    def handle_starttag(self, tag, attrs):
+        """Record the href of an <a> start tag when it matches url_reg.
+
+        tag   -- tag name as passed in by HTMLParser
+        attrs -- list of (name, value) pairs; value is None when the
+                 attribute was written without a value
+        """
+        if tag != 'a':
+            return
+
+        for name, value in attrs:
+            # A bare attribute (<a href>) has no value to match against.
+            if name != 'href' or value is None:
+                continue
+            if self.url_reg.search(value) is None:
+                continue
+            # Skip a link identical to the one matched just before it.
+            if self.prev != value:
+                self.hyperlinks.append(value)
+            self.prev = value
+
     def get_hyperlinks(self):
+        """Return the list of hrefs collected so far (not a copy)."""
         return self.hyperlinks
 class MySubParser(MyParser):
+    """MyParser variant matching hrefs that end in /src/<digits>.<ext>."""
+
-    def __init__(self, verbose=0):
-        MyParser.__init__(self, verbose)
+    def __init__(self):
+        MyParser.__init__(self)
-        self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')
+        # <ext> is three or four word characters; raw string keeps the
+        # backslashes for the regex engine.
+        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')