Since most HTML is basically xml (or can easily be trimmed to be compatible with most xml parsers) I would suggest using an xml parser. Most python HTML-specific parsers are just subclasses of an xml parser anyway.
Check out: Python and XML.
Here is a good tutorial: Python XML Parser Tutorial.
Also, the xml.dom.minidom Class has been super useful for me personally.
Another similar method is explained here: xml.etree.ElementTree.
This is a good example from the xml.dom.minidom reference page:
import xml.dom.minidom
document = """\
<slideshow>
<title>Demo slideshow</title>
<slide><title>Slide title</title>
<point>This is a demo</point>
<point>Of a program for processing slides</point>
</slide>
<slide><title>Another demo slide</title>
<point>It is important</point>
<point>To have more than</point>
<point>one slide</point>
</slide>
</slideshow>
"""
dom = xml.dom.minidom.parseString(document)
def getText(nodelist):
rc = []
for node in nodelist:
if node.nodeType == node.TEXT_NODE:
rc.append(node.data)
return ''.join(rc)
def handleSlideshow(slideshow):
print "<html>"
handleSlideshowTitle(slideshow.getElementsByTagName("title")[0])
slides = slideshow.getElementsByTagName("slide")
handleToc(slides)
handleSlides(slides)
print "</html>"
def handleSlides(slides):
for slide in slides:
handleSlide(slide)
def handleSlide(slide):
handleSlideTitle(slide.getElementsByTagName("title")[0])
handlePoints(slide.getElementsByTagName("point"))
def handleSlideshowTitle(title):
print "<title>%s</title>" % getText(title.childNodes)
def handleSlideTitle(title):
print "<h2>%s</h2>" % getText(title.childNodes)
def handlePoints(points):
print "<ul>"
for point in points:
handlePoint(point)
print "</ul>"
def handlePoint(point):
print "<li>%s</li>" % getText(point.childNodes)
def handleToc(slides):
for slide in slides:
title = slide.getElementsByTagName("title")[0]
print "<p>%s</p>" % getText(title.childNodes)
handleSlideshow(dom)
If you absolutely must use regex instead of a parser, check out the re module:
In [1]: import re
In [2]: grps = re.search(r"<([^>]+)>([^<]+)</\1>", "<abc>123</abc>")
In [3]: if grps:
In [4]: print grps.groups()
Out[3]: ('abc', '123')
amazon > electronics > game console > PS3