I have the following code in Python.
"""
Module to encapsulate body parsing.
"""
from urlparse import urlparse
from bs4 import BeautifulSoup,Comment
import os
import shutil
from hct import utils
BASE_DIR = os.getcwd()
PAGE_SOURCE_CMD = 'phantomas %s --page-source'
FEO_PAGE_SOURCE_CMD = 'phantomjs RequestURL.js %s > body.html'
class Extractor(object):
"""
This file provides utility to do body parsing of an url.
"""
def __init__(self):
pass
def check_tags(self, tags, feed):
"""
Method: Method to handle the tags as encountered during parsing.
Also contains the business logic to check to prefetch and
preresolve DNS eanblement
Args: Takes the tag and its attributes as a list
Returns: A dictionary of tags and their values.
"""
result = {}
for tag in tags:
if len(feed.select('link[rel='+tag+']')) > 0:
result['link'] = tag
return result
def get_generated_html(self, url, has_headers):
"""
Method: Method to get the generated HTML content from Phantomas.
Args: Takes the url as an argument for which to get the HTML content.
hasHeaders defaulted to false for no headers.
Returns: Nothing.
"""
if not urlparse(url).scheme:
url = 'http://'+url
if has_headers == False:
command = PAGE_SOURCE_CMD % url
else:
command = FEO_PAGE_SOURCE_CMD % url
utils.execute_command(command).communicate()
def create_analysis_folder(self, analysis_id, has_headers):
if not os.path.exists(os.path.join(BASE_DIR, analysis_id)):
os.makedirs(os.path.join(BASE_DIR,analysis_id))
path = os.path.join(BASE_DIR, analysis_id, 'html')
if has_headers:
os.makedirs(path)
shutil.copy(os.path.join(BASE_DIR, "RequestURL.js"), path)
return path
"""
def create_analysis_folder(self, analysis_id, has_headers):
Method: To create a folder to fetch and analyse the HTML based on
analysis ID.
Args: Takes the Analsis ID as an argument.
Returns: The path to the created folder.
analysis_id = str(analysis_id)
path = None
if not os.path.exists(analysis_id):
os.makedirs(analysis_id)
os.chdir(analysis_id)
if has_headers == False:
path = os.getcwd() + '/html'
print path
return path
else:
print "coming here"
os.makedirs('html')
os.chdir('html')
shutil.copy("../../hct/data_processors/RequestURL.js", os.getcwd())
return os.getcwd()
"""
def start_parser(self, analysis_id, url, hasHeaders=False):
"""
Method: Method to start the parser.
Args: Analsyis ID and URL as an argument.
Returns: Nothing.
"""
feed = None
analysis_id = str(analysis_id)
path = self.create_analysis_folder(analysis_id, hasHeaders)
os.chdir(path)
self.get_generated_html(url, hasHeaders)
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith('.html'):
feed = BeautifulSoup(open(path + '/' +file).read())
print feed
if hasHeaders:
os.chdir('..')
#shutil.rmtree(path)
break
return feed
I am creating folder based on some conditions, downloading the HTML source and then after parsing have to delete the folders. This module will be called multiple times. The problem is that the very first time it works fine. But the second time the line #shutil.rmtree(path) throws and error No such file or directorybecause it is trying to create a directory in a non-existent directory which gets deleted when first call to the method happens.
How do I overcome this problem as I have to delete the directory otherwise it will flood the server memory