@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
2828 return False
2929
3030import sys
31+ import types
3132
3233import inputstream
3334import tokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
3738from treebuilders import simpletree
3839
3940import utils
41+ import constants
4042from constants import spaceCharacters , asciiUpper2Lower
4143from constants import scopingElements , formattingElements , specialElements
4244from constants import headingElements , tableInsertModeElements
4345from constants import cdataElements , rcdataElements , voidElements
4446from constants import tokenTypes , ReparseException , namespaces
4547
# Module-wide debugging switch. When true, the Phase class body installs a
# metaclass built from method_decorator_metaclass(log).
debug_log = True
49+
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    The tree builder is looked up by name through
    ``treebuilders.getTreeBuilder`` and handed to a new ``HTMLParser``;
    the value returned is whatever that parser's ``parse`` method returns
    for ``doc`` with the given ``encoding``.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
5560 p = HTMLParser (tb , namespaceHTMLElements = namespaceHTMLElements )
5661 return p .parseFragment (doc , container = container , encoding = encoding )
5762
def method_decorator_metaclass(function):
    """Return a metaclass that decorates every plain function in a class body.

    Classes created through the returned metaclass have each attribute that
    is a ``types.FunctionType`` replaced by ``function(attribute)``; every
    other attribute is passed through untouched.

    function -- a one-argument decorator taking and returning a callable.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Build a fresh namespace instead of rewriting classDict while
            # iterating over it, so the mapping handed in by the caller is
            # left unmodified.
            decorated = dict(
                (attributeName,
                 function(attribute)
                 if isinstance(attribute, types.FunctionType)
                 else attribute)
                for attributeName, attribute in classDict.items())
            return type.__new__(meta, classname, bases, decorated)
    return Decorated
73+
5874class HTMLParser (object ):
5975 """HTML parser. Generates a tree structure from a stream of (possibly
6076 malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129145 self .tree .reset ()
130146 self .firstStartTag = False
131147 self .errors = []
148+ self .log = [] #only used with debug mode
132149 # "quirks" / "limited quirks" / "no quirks"
133150 self .compatMode = "no quirks"
134151
@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420437
421438 self .phase = self .phases ["text" ]
422439
def log(function):
    """Logger that records which phase processes each token.

    Decorator for phase methods. Unless the wrapped method is ``__init__``
    or is called without positional arguments, the first positional argument
    is treated as a token and a tuple of

        (tokenizer state name, current parser phase class name,
         class name of ``self``, wrapped method name, token info dict)

    is appended to ``self.parser.log`` before the method runs. The info dict
    holds the token type name and, for token types listed in
    ``constants.tagTokenTypes``, the token's name. The wrapped method's
    return value is passed through unchanged.
    """
    import functools

    # Reverse lookup: token type value -> its key in constants.tokenTypes.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    @functools.wraps(function)
    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and args:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Show the offending argument before propagating, so a call
                # that passed something other than a known token is easy to
                # trace.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)
    return wrapped
464+
423465class Phase (object ):
424466 """Base class for helper object that implements each phase of processing
425467 """
@@ -434,6 +476,9 @@ class Phase(object):
434476 # * EndTag
435477 # - endTag* methods
436478
479+ if debug_log :
480+ __metaclass__ = method_decorator_metaclass (log )
481+
437482 def __init__ (self , parser , tree ):
438483 self .parser = parser
439484 self .tree = tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
10081053 self .parser .parseError (u"unexpected-start-tag" , {"name" : "form" })
10091054 else :
10101055 if self .tree .elementInScope ("p" ):
1011- self .endTagP ("p" )
1056+ self .endTagP (impliedTagToken ( "p" ) )
10121057 self .tree .insertElement (token )
10131058 self .tree .formPointer = self .tree .openElements [- 1 ]
10141059
@@ -1831,7 +1876,7 @@ def processEOF(self):
18311876 return
18321877 else :
18331878 ignoreEndTag = self .ignoreEndTagColgroup ()
1834- self .endTagColgroup ("colgroup" )
1879+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
18351880 if not ignoreEndTag :
18361881 self .parser .phase .processEOF ()
18371882
@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
18471892
def startTagOther(self, token):
    """Handle an implied colgroup end tag, then re-dispatch the start tag.

    The start tag is handed to ``self.parser.phase.processStartTag`` only
    when ``ignoreEndTagColgroup()`` reported false before the implied end
    tag was handled.
    """
    # Sampled before endTagColgroup runs; the re-dispatch decision below
    # uses this earlier answer.
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End-tag handlers receive a token, not a bare tag name; impliedTagToken
    # is defined elsewhere in this module and presumably builds that token
    # -- confirm.
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processStartTag(token)
18531898
@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
18651910
def endTagOther(self, token):
    """Handle an implied colgroup end tag, then re-dispatch the end tag.

    The end tag is handed to ``self.parser.phase.processEndTag`` only when
    ``ignoreEndTagColgroup()`` reported false before the implied end tag
    was handled.
    """
    # Sampled before endTagColgroup runs; the re-dispatch decision below
    # uses this earlier answer.
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End-tag handlers receive a token, not a bare tag name; impliedTagToken
    # is defined elsewhere in this module and presumably builds that token
    # -- confirm.
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
18711916
@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
20162061
20172062 def startTagTableOther (self , token ):
20182063 ignoreEndTag = self .ignoreEndTagTr ()
2019- self .endTagTr ("tr" )
2064+ self .endTagTr (impliedTagToken ( "tr" ) )
20202065 # XXX how are we sure it's always ignored in the innerHTML case?
20212066 if not ignoreEndTag :
20222067 self .parser .phase .processStartTag (token )
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
20362081
def endTagTable(self, token):
    """Handle an implied tr end tag, then re-dispatch the table end tag.

    The end tag is handed to ``self.parser.phase.processEndTag`` only when
    ``ignoreEndTagTr()`` reported false before the implied end tag was
    handled.
    """
    ignoreEndTag = self.ignoreEndTagTr()
    # End-tag handlers receive a token, not a bare tag name; impliedTagToken
    # is defined elsewhere in this module and presumably builds that token
    # -- confirm.
    self.endTagTr(impliedTagToken("tr"))
    # Reprocess the current tag if the tr end tag was not ignored
    # XXX how are we sure it's always ignored in the innerHTML case?
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
20442089
20452090 def endTagTableRowGroup (self , token ):
20462091 if self .tree .elementInScope (token ["name" ], variant = "table" ):
2047- self .endTagTr ("tr" )
2092+ self .endTagTr (impliedTagToken ( "tr" ) )
20482093 self .parser .phase .processEndTag (token )
20492094 else :
20502095 # innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
21872232
def startTagSelect(self, token):
    """Report a parse error for this select start tag and handle an
    implied select end tag instead.

    The incoming start tag token itself is not used further.
    """
    self.parser.parseError("unexpected-select-in-select")
    # End-tag handlers receive a token, not a bare tag name; impliedTagToken
    # is defined elsewhere in this module and presumably builds that token
    # -- confirm.
    self.endTagSelect(impliedTagToken("select"))
21912236
def startTagInput(self, token):
    """Report a parse error for this input start tag; when a select element
    is in table scope, handle an implied select end tag and re-dispatch the
    input tag.

    When no select element is in table scope, only the parse error is
    recorded and the token is dropped.
    """
    self.parser.parseError("unexpected-input-in-select")
    if self.tree.elementInScope("select", variant="table"):
        # End-tag handlers receive a token, not a bare tag name;
        # impliedTagToken is defined elsewhere in this module and
        # presumably builds that token -- confirm.
        self.endTagSelect(impliedTagToken("select"))
        self.parser.phase.processStartTag(token)
21972242
21982243 def startTagOther (self , token ):
0 commit comments