I'm trying to strip the HTML tags from some text. It works to an extent, but not all of the tags are removed; the tags mentioned below are still present in the output.
# Print every body that was not handled earlier, with its HTML markup
# stripped by remove_tags().
print('NOT DEALT WITH:')
for body in not_dealt_with_list:
    # A regex-based alternative that was tried here, kept for reference:
    #   re.compile(r'<.*?[\\t\\n\\r\\s]*?.*?>').sub('', body)
    print(remove_tags(body))
def remove_tags(content):
    """Return the text of an HTML string with its markup stripped.

    The input is parsed with an lxml HTML parser configured to drop
    comments and whitespace-only text, and the text content of the
    resulting document is returned.

    Args:
        content: HTML source to clean.

    Returns:
        The text content of the parsed document, or ``''`` when
        ``content`` is empty or contains only whitespace.
    """
    # lxml cannot build a document from empty or whitespace-only input
    # (it raises ParserError: "Document is empty"); there is no markup to
    # strip in that case, so short-circuit before parsing.
    if not content.strip():
        return ''

    parser = lxml.html.HTMLParser(remove_comments=True,
                                  remove_blank_text=True)
    document = lxml.html.document_fromstring(content, parser)
    # NOTE(review): text_content() yields decoded text, so entity-escaped
    # markup in the input (e.g. '&lt;p&gt;') comes back as a literal '<p>'.
    # This is a likely cause of tags that appear to survive -- confirm
    # against the failing inputs.
    return document.text_content()
