summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Tom Willemsen, 2011-06-11 11:38:21 +0200
committer: Tom Willemsen, 2011-06-11 11:38:21 +0200
commit: 02524e678017fa64d54714439f322790351705c6 (patch)
tree: 13d9f1000b02f0c928a4b0109ac618ab51c76c8c
parent: 4bd89ebdedf429186bb1381a61c3adf9eaeb5eb9 (diff)
download: 4grab-develop-arch.tar.gz
4grab-develop-arch.zip
Moved away from deprecated SGMLlib parser [branch: develop-arch]
I'd heard that the SGMLlib parser had been deprecated in favor of HTMLParser, so now I've moved to that.
-rw-r--r-- htmlparser.py 36
1 file changed, 19 insertions, 17 deletions
diff --git a/htmlparser.py b/htmlparser.py
index bbfd620..aa1c885 100644
--- a/htmlparser.py
+++ b/htmlparser.py
@@ -17,33 +17,35 @@
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
import re
-class MyParser(sgmllib.SGMLParser):
- def __init__(self, verbose=0):
- sgmllib.SGMLParser.__init__(self, verbose)
-
+class MyParser(HTMLParser):
+ def __init__(self):
+ HTMLParser.__init__(self)
+
self.hyperlinks = []
self.url_reg = re.compile('res/\d+\Z')
- self.prev = ""
-
+
def parse(self, s):
self.feed(s)
self.close()
- def start_a(self, attributes):
- for name, value in attributes:
- if name == "href":
- if self.url_reg.search(value) != None:
- if self.prev != value:
- self.hyperlinks.append(value)
- self.prev = value
-
+ def handle_starttag(self, tag, attrs):
+ prev = ""
+
+ if tag == 'a':
+ for name, value in attrs:
+ if name == 'href':
+ if self.url_reg.search(value) != None:
+ if prev != value:
+ self.hyperlinks.append(value)
+ prev = value
+
def get_hyperlinks(self):
return self.hyperlinks
class MySubParser(MyParser):
- def __init__(self, verbose=0):
- MyParser.__init__(self, verbose)
+ def __init__(self):
+ MyParser.__init__(self)
self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')