Get the html under a tag using htmlparser python

Question

I want to get the whole HTML under a tag using HTMLParser. I am currently able to get the data between the tags; the following is my code:

class LinksParser(HTMLParser):
  """Collect the text found inside ``<span itemprop="description">``.

  ``recording`` is the current ``<span>`` nesting depth counted from the
  matching description span (0 means "outside, ignore everything").
  ``data`` accumulates the text seen while ``recording`` is non-zero;
  markup is not preserved, only character data.
  """

  def __init__(self):
    HTMLParser.__init__(self)
    self.recording = 0
    self.data = ''

  def handle_starttag(self, tag, attributes):
    """Start recording on the description span; track nested spans."""
    if tag == 'span':
      if self.recording:
        # Already inside the description: just go one level deeper so the
        # matching </span> does not end the recording too early.
        self.recording += 1
      elif any(key == 'itemprop' and val == 'description'
               for key, val in attributes):
        self.recording = 1

  def handle_endtag(self, tag):
    """Leave one span level; recording stops when the depth hits zero."""
    if self.recording and tag == 'span':
      self.recording -= 1

  def handle_data(self, data):
    """Append character data while inside the description span."""
    if not self.recording:
      return
    self.data = self.data + data

I also want the HTML tags inside the input. For example,

<span itemprop="description">
<h1>My First Heading</h1>
<p>My first <br/><br/>paragraph.</p>
</span>

When this is provided as input, I only get the data without the tags. Is there any way to get the whole HTML between the tags?

stackoverflow.com/questions/2061718/…

user2665694
– user2665694

2012-11-11 18:48:13 +00:00
Commented Nov 11, 2012 at 18:48 — user2665694
– user2665694, Commented Nov 11, 2012 at 18:48
See stackoverflow.com/questions/2061718/…

user2665694
– user2665694

2012-11-11 18:48:42 +00:00
Commented Nov 11, 2012 at 18:48 — user2665694
– user2665694, Commented Nov 11, 2012 at 18:48

jfs · Accepted Answer · 2012-11-11 23:29:32Z

5

One could use xml.etree.ElementTree.TreeBuilder to exploit etree API for finding/manipulating the <span> element:

import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree

class LinksParser(HTMLParser):
    """Feed HTMLParser events into an ElementTree ``TreeBuilder``.

    After ``feed()``-ing the document, call ``close()`` to obtain the root
    element of the resulting tree, which can then be searched and
    serialized with the regular etree API.
    """

    # HTML void elements never have an end tag, so a bare ``<br>`` must be
    # closed as soon as it is opened; otherwise everything that follows would
    # be nested inside it and later end tags would close the wrong element.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open *tag* in the tree; void elements are closed immediately."""
        self.tb.start(tag, dict(attributes))
        if tag in self.VOID_ELEMENTS:
            self.tb.end(tag)

    def handle_endtag(self, tag):
        """Close *tag* in the tree.

        End tags of void elements are ignored because ``handle_starttag``
        has already closed them (this also covers the ``<br/>`` form, which
        HTMLParser reports as a start tag followed by an end tag).
        """
        if tag not in self.VOID_ELEMENTS:
            self.tb.end(tag)

    def handle_data(self, data):
        """Append character data to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Finish parsing and return the root element of the built tree."""
        HTMLParser.close(self)
        return self.tb.close()

# Parse the whole HTML document read from standard input into an element tree.
parser = LinksParser()
parser.feed(sys.stdin.read())
root = parser.close()
# Locate the first <span itemprop="description"> and serialize it, including
# its own start and end tags, to standard output.
# NOTE(review): ".//" searches the descendants of root, so this presumably
# expects the span to be nested inside another element (e.g. <html>/<body>)
# rather than being the document root itself -- confirm against real input.
span = root.find(".//span[@itemprop='description']")
etree.ElementTree(span).write(sys.stdout)

Output

<span itemprop="description">
<h1>My First Heading</h1>
<p>My first <br /><br />paragraph.</p>
</span>

To print without the parent (root) <span> tag:

# span.text is None when the <span> starts directly with a child element;
# writing None would raise TypeError, so fall back to an empty string.
sys.stdout.write(span.text or '')
for child in span:
    # tostring() also emits the child's tail, i.e. the text that follows the
    # child inside the span, so no separate handling of tails is needed.
    sys.stdout.write(etree.tostring(child)) # add encoding="unicode" on Python 3

edited Nov 11, 2012 at 23:29

answered Nov 11, 2012 at 19:22

jfs

417k210 gold badges1k silver badges1.7k bronze badges

Sign up to request clarification or add additional context in comments.

2 Comments

raju Over a year ago

Is there any clean way to strip off the starting and ending tags? For example, the above code gives <span itemprop="description"> <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p> </span> though I want just <h1>My First Heading</h1> <p>My first <br/><br/>paragraph.</p>

jfs Over a year ago

@santu: I've added the code example to show how to print without the root tag

BenTrofatter · Accepted Answer · 2012-11-11 18:49:39Z

Here's something that gets the job done based on the test data you provided with minimal changes to your existing code (assuming it's basically doing what you want already). You'd probably want to expand it to deal with self-closing tags in a more robust way:

from HTMLParser import HTMLParser

class LinksParser(HTMLParser):
    """Capture the inner HTML of ``<span itemprop="description">``.

    ``recording`` is the ``<span>`` nesting depth counted from the matching
    description span (0 means "outside"); ``data`` accumulates the markup
    and text seen while recording. The description span's own start and end
    tags are not part of ``data``; nested spans are.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.recording = 0
        self.data = ''
        # Tags that are written as ``<tag/>`` and never get an end tag.
        self.self_closing_tags = ("br",)

    def _format_starttag(self, tag, attributes, self_closing=False):
        """Return the markup for a start tag with its attributes."""
        parts = [tag]
        for name, value in attributes:
            if value is None:
                # Valueless attribute such as ``<input disabled>``.
                parts.append(name)
            else:
                # HTMLParser hands over unescaped values; re-escape the
                # characters that would break a double-quoted attribute.
                escaped = value.replace('&', '&amp;').replace('"', '&quot;')
                parts.append('%s="%s"' % (name, escaped))
        return "<%s%s>" % (" ".join(parts), "/" if self_closing else "")

    def handle_starttag(self, tag, attributes):
        """Start recording on the description span, else capture the tag."""
        if tag == 'span':
            if self.recording:
                # Nested span: part of the captured markup, and it deepens
                # the nesting so its </span> does not end the recording.
                self.recording += 1
                self.data += self._format_starttag(tag, attributes)
            elif any(name == 'itemprop' and value == 'description'
                     for name, value in attributes):
                self.recording = 1
            return
        if self.recording:
            # Self-closing tags are emitted here (not in handle_endtag) so a
            # bare ``<br>`` is captured just like ``<br/>``.
            self.data += self._format_starttag(
                tag, attributes, tag in self.self_closing_tags)

    def handle_endtag(self, tag):
        """Capture the end tag, or stop recording on the closing span."""
        if not self.recording or tag in self.self_closing_tags:
            # Outside the description, or already emitted as ``<tag/>``.
            return
        if tag == 'span':
            self.recording -= 1
            if self.recording:
                # Still inside the description: this closed a nested span.
                self.data += "</span>"
            return
        self.data += "</%s>" % (tag,)

    def handle_data(self, data):
        """Append character data while inside the description span."""
        if self.recording:
            self.data += data

Given this as input:

<span itemprop="description">
<h1>My First Heading</h1>
<p>My first <br/><br/>paragraph.</p>
</span>

the output is:

<h1>My First Heading</h1>
<p>My first <br/><br/>paragraph.</p>

Collectives™ on Stack Overflow

Get the html under a tag using htmlparser python

2 Answers 2

Output

2 Comments

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

2 Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related