0

I'm using HTMLParser (Python 2.7) to parse pages I pull down with urllib2, and I get an AttributeError exception when I call the feed method after adding an __init__ method that creates the lists I want to store my data in. If I comment out the __init__ method, the exception goes away.


main.py

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class MyHTMLParser(HTMLParser):  # collects attribute values from <div> start tags
    def __init__(self):  # NOTE: HTMLParser.__init__ is never called, so feed() finds no 'rawdata'
        self.terms = []  # filled when a div attribute value equals 'word'
        self.definitions = []  # filled when a div attribute value equals 'desc'

    def handle_starttag(self, tag, attrs):  # attrs is iterated as (name, value) pairs
        # retrieve the terms (only <div> tags are inspected)
        if tag == 'div':
            for attribute, value in attrs:
                if value == 'word':
                    self.terms.append(attrs[1][1])  # value of the tag's second attribute
        # retrieve the definitions (this check still runs inside the for loop above)
                if value == 'desc':
                    if attrs[1][1]:
                        self.definitions.append(attrs[1][1])
                    else:
                        self.definitions.append(None)  # falsy second-attribute value is stored as None


parser = MyHTMLParser()
# open the page and retrieve its source
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()

# extract the terms and definitions
parser.feed(html)  # this is the call that raises the AttributeError shown in the traceback

Output

Traceback (most recent call last):
  File "/Users/megachweng/Project/Anki-Youdao/combined.py", line 35, in <module>
    parser.feed(html)
  File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 116, in feed
    self.rawdata = self.rawdata + data
AttributeError: MyHTMLParser instance has no attribute 'rawdata'
1
  • Sorry, for some reason I can't use any third-party packages. Commented Jun 15, 2017 at 11:03

2 Answers 2

1

I think that you don't initialize HTMLParser properly. Maybe you don't need to initialize it at all. This works for me:

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class MyHTMLParser(HTMLParser):  
    # No __init__ override here, so the inherited HTMLParser initializer runs
    # and feed() no longer fails on the missing 'rawdata' attribute.
    # NOTE: self.terms and self.definitions are never created in this version,
    # so the appends below raise AttributeError once a matching attribute is found.
    def handle_starttag(self, tag, attrs):
        # debug trace: report every start tag the parser sees
        print "Encountered a start tag:", tag
        # retrieve the terms (only <div> tags are inspected)
        if tag == 'div':
            for attribute, value in attrs:
                if value == 'word':
                    self.terms.append(attrs[1][1])  # value of the tag's second attribute
        # retrieve the definitions (this check still runs inside the for loop above)
                if value == 'desc':
                    if attrs[1][1]:
                        self.definitions.append(attrs[1][1])
                    else:
                        self.definitions.append(None)  # falsy second-attribute value is stored as None


parser = MyHTMLParser()
# open the page and retrieve its source
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()

# extract the terms and definitions
parser.feed(html)

UPDATE

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class MyHTMLParser(HTMLParser):
    """Parser that records the value of every ``align`` attribute it sees."""

    def __init__(self):
        # The base initializer is invoked explicitly by class name so that the
        # parser's own state exists before feed() is called.
        HTMLParser.__init__(self)
        self.terms, self.definitions = [], []

    def handle_starttag(self, tag, attrs):
        """Append each ``align`` attribute value of the tag to both lists.

        ``attrs`` is a sequence of (name, value) pairs; ``tag`` is not
        inspected, so every start tag is considered.
        """
        aligned = [pair[1] for pair in attrs if pair[0] == 'align']
        self.terms.extend(aligned)
        self.definitions.extend(aligned)


parser = MyHTMLParser()

# sample markup containing two 'align' attributes
html = "<table align='center'><tr><td align='left'><p>ciao</p></td></tr></table>"

# extract the terms and definitions
parser.feed(html)

# show what the parser collected
print parser.terms
print parser.definitions

Output:

['center', 'left']

['center', 'left']

Sign up to request clarification or add additional context in comments.

1 Comment

Yes, it absolutely works, but do you have any idea how to store attrs[1][1] in a list so that I can access it afterwards? (print parser.terms fails with "MyHTMLParser instance has no attribute 'terms'".)
0

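OK, I found the solution: super().__init__() does not work here (in Python 2.7 HTMLParser is an old-style class, which super() does not support), so the base class name must be hard-coded: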

def __init__(self):
        HTMLParser.__init__(self)  # call the base initializer by class name; super() did not work here

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.