summary | refs | log | tree | commit | diff | stats
path: root/htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'htmlparser.py')
-rw-r--r-- htmlparser.py 30
1 files changed, 30 insertions, 0 deletions
diff --git a/htmlparser.py b/htmlparser.py
new file mode 100644
index 0000000..73338dd
--- /dev/null
+++ b/htmlparser.py
@@ -0,0 +1,30 @@
import re

try:
    import sgmllib  # Python 2 standard library
except ImportError:  # sgmllib was removed in Python 3.0
    sgmllib = None
    from html.parser import HTMLParser
+
if sgmllib is not None:
    # Python 2: keep using the original SGML parser unchanged.
    _ParserBase = sgmllib.SGMLParser
else:
    class _ParserBase(HTMLParser):
        """Small Python 3 stand-in for ``sgmllib.SGMLParser``.

        ``sgmllib`` is not available on Python 3.  This adapter provides only
        what ``MyParser`` relies on: the ``verbose`` constructor argument and
        dispatching every start tag to a ``start_<tag>(attributes)`` method
        when the subclass defines one.  ``do_<tag>`` / ``end_<tag>`` handlers
        are not emulated.
        """

        def __init__(self, verbose=0):
            HTMLParser.__init__(self)
            self.verbose = verbose

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)


class MyParser(_ParserBase):
    """Collect ``href`` values of ``<a>`` tags that match ``url_reg``.

    The default pattern accepts values ending in ``res/<digits>``.  A value
    equal to the most recently collected one is not collected again, so only
    back-to-back repeats are dropped; the same link may reappear later in
    the result.
    """

    def __init__(self, verbose=0):
        """Create an empty collector.

        verbose -- forwarded to the underlying parser's constructor.
        """
        _ParserBase.__init__(self, verbose)

        # Matching href values, in document order.
        self.hyperlinks = []
        # Pattern an href must match (via ``search``) to be collected.
        self.url_reg = re.compile(r'res/\d+\Z')
        # Last collected value, used to drop immediate repeats.
        self.prev = ""

    def parse(self, s):
        """Feed the markup string *s* to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Handle an ``<a>`` start tag.

        attributes -- iterable of ``(name, value)`` pairs for the tag.
        """
        for name, value in attributes:
            # html.parser reports a valueless attribute as None; there is
            # nothing to match in that case.
            if name != "href" or value is None:
                continue
            if self.url_reg.search(value) is None:
                continue
            if self.prev != value:
                self.hyperlinks.append(value)
                # NOTE(review): the original indentation was lost in this
                # view; ``prev`` is assumed to be updated only for collected
                # links -- confirm against the repository.
                self.prev = value

    def get_hyperlinks(self):
        """Return the list of collected href values (the live list)."""
        return self.hyperlinks
+
class MySubParser(MyParser):
    """``MyParser`` variant that collects hrefs ending in ``/src/<digits>.<ext>``.

    ``<ext>`` is three or four word characters.  Everything else (collection,
    duplicate handling, accessors) is inherited from ``MyParser``.
    """

    def __init__(self, verbose=0):
        """Create the parser and replace the pattern set by ``MyParser``.

        verbose -- forwarded to ``MyParser.__init__``.
        """
        MyParser.__init__(self, verbose)
        # Raw string: ``\d``, ``\.``, ``\w`` and ``\Z`` are regex escapes,
        # not string escapes.
        self.url_reg = re.compile(r'/src/\d+\.\w{3,4}\Z')