How to delete a python directory effectively?

Question

I have the following code in Python.

"""
Module to encapsulate body parsing.
"""

from urlparse import urlparse
from bs4 import BeautifulSoup,Comment
import os
import shutil
from hct import utils


BASE_DIR = os.getcwd()
PAGE_SOURCE_CMD = 'phantomas %s --page-source'
FEO_PAGE_SOURCE_CMD = 'phantomjs  RequestURL.js %s > body.html'


class Extractor(object):
    """
    This file provides utility to do body parsing of an url.
    """

    def __init__(self):
        pass

    def check_tags(self, tags, feed):
        """
        Method: Method to handle the tags as encountered during parsing.
                Also contains the business logic to check to prefetch and
                preresolve DNS eanblement

        Args: Takes the tag and its attributes as a list

        Returns: A dictionary of tags and their values.
        """
        result = {}
        for tag in tags:
            if len(feed.select('link[rel='+tag+']')) > 0:
                result['link'] = tag
        return result

    def get_generated_html(self, url, has_headers):
        """
        Method: Method to get the generated HTML content from Phantomas.

        Args: Takes the url as an argument for which to get the HTML content.
              hasHeaders defaulted to false for no headers.

        Returns: Nothing.
        """
        if not urlparse(url).scheme:
            url = 'http://'+url
        if has_headers == False:
            command = PAGE_SOURCE_CMD % url
        else:
            command = FEO_PAGE_SOURCE_CMD % url
        utils.execute_command(command).communicate()

    def create_analysis_folder(self, analysis_id, has_headers):

        if not os.path.exists(os.path.join(BASE_DIR, analysis_id)):
            os.makedirs(os.path.join(BASE_DIR,analysis_id))
        path = os.path.join(BASE_DIR, analysis_id, 'html')
        if has_headers:
            os.makedirs(path)
            shutil.copy(os.path.join(BASE_DIR, "RequestURL.js"), path) 
        return path

    """
    def create_analysis_folder(self, analysis_id, has_headers):

        Method: To create a folder to fetch and analyse the HTML based on
                analysis ID.

        Args: Takes the Analsis ID as an argument.

        Returns: The path to the created folder.


        analysis_id = str(analysis_id)
        path = None
        if not os.path.exists(analysis_id):
            os.makedirs(analysis_id)
        os.chdir(analysis_id)
        if has_headers == False:
            path = os.getcwd() + '/html'
            print path
            return path
        else:
            print "coming here"
            os.makedirs('html')
            os.chdir('html')
            shutil.copy("../../hct/data_processors/RequestURL.js", os.getcwd()) 
            return os.getcwd()
    """

    def start_parser(self, analysis_id, url, hasHeaders=False):
        """
        Method: Method to start the parser.

        Args: Analsyis ID and URL as an argument.

        Returns: Nothing.
        """

        feed = None
        analysis_id = str(analysis_id)
        path = self.create_analysis_folder(analysis_id, hasHeaders)
        os.chdir(path)
        self.get_generated_html(url, hasHeaders)
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith('.html'):
                    feed = BeautifulSoup(open(path + '/' +file).read())
                    print feed
                    if hasHeaders:
                        os.chdir('..')
                    #shutil.rmtree(path)

            break
        return feed

I am creating folder based on some conditions, downloading the HTML source and then after parsing have to delete the folders. This module will be called multiple times. The problem is that the very first time it works fine. But the second time the line #shutil.rmtree(path) throws and error No such file or directorybecause it is trying to create a directory in a non-existent directory which gets deleted when first call to the method happens.

How do I overcome this problem as I have to delete the directory otherwise it will flood the server memory

jfs · Accepted Answer · 2014-12-30 07:31:31Z

1

If shutil.rmtree(path) raises "No such file or directory" exception for the path directory itself then it is probably a bug in shutil.rmtree() -- it should be happy that somebody did its work for it -- it should ignore such errors.

You could write a wrapper that workarounds the problem:

import errno
import shutil

def rmtree(path):
    try:
        return shutil.rmtree(path)
    except OSError as e:
        if e.errno == errno.ENOENT and e.filename == path:
            pass # path is gone already -- ignore the exception
        else:
            raise

You can call rmtree(path) whether the path exists or not.

You could also use ignore_errors, onerror rmtree's parameters for the same effect.

edited Dec 30, 2014 at 7:31

answered Dec 30, 2014 at 7:26

jfs

417k210 gold badges1k silver badges1.7k bronze badges

Sign up to request clarification or add additional context in comments.

Comments

John Zwinck · Accepted Answer · 2014-12-30 05:39:41Z

1

You can just use os.mkdir() to recreate the directory immediately after shutil.rmtree().

answered Dec 30, 2014 at 5:39

John Zwinck

252k44 gold badges346 silver badges459 bronze badges

Collectives™ on Stack Overflow

How to delete a python directory effectively?

2 Answers 2

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related