0

I understand this isn't exactly the smallest code, i've tried to cut it down as much as i could. The script just consumes more and more memory until it finally runs out. I've used unset() where possible but it doesn't seem to have any effect. It always seems to error in the MultiGet function but i'm not sure if that is where the leak is. Any input would be greatly appreciated.

public function Test()
{
    $base = dirname(__FILE__) .'/';
    $prod_file = $base.'products.dbf';

    $this->dbf->load($prod_file);
    $num_rec=$ci->dbf->dbf_num_rec;

    $buffer = Array();
    for($i=0;$i<$num_rec;$i++):
        $row = $ci->dbf->getRowAssoc($i);

        $info = Array('part_number' => $row['PART_NUM'],
                      'td_group_id' => $row['GRP'],
                      'name' => 'DESCR');

        $this->db->where('td_group_id',$info['td_group_id']);
        $result = $this->db->get('tbl_categories')->row_array();
        if(isset($result['id'])):
            $info['category_id'] = $result['id'];
            $buffer[]  = $info;
        endif;

        if(count($buffer) == 100 || $i == $num_rec -1):
            $url_buffer = Array();
            foreach($buffer as $row):
                $url_buffer[] = $this->_product_url($row['part_number']);
            endforeach;

            $html_returns = $this->MultiCrawl($url_buffer);
            foreach($html_returns as $url_index=>$html):
                $more_info = $this->_extract_more_info($html);
                if($more_info):
                    $more_info['category_id'] = $buffer[$url_index]['category_id'];
                    $more_info['td_part_number'] = $buffer[$url_index]['part_number'];
                    $this->_parse_product($more_info);
                endif;
            endforeach;
            $buffer = Array();
        endif;

    endfor;



}


function MultiGet($all_urls)
{

    $useragent = $this->_useragent;
    $cookie_file = $this->_cookie_file;

    $url_index = $this->UrlIndex($all_urls);

    $return_buffer = Array();

    $mh = curl_multi_init();

    $ch = Array();
    $max_connections = 15;
    $index = 0;
    $open_connections = 0;
    $execReturnValue = true;
    $running = true;
    $max_index = count($all_urls)-1;
    $url_count = count($all_urls);
    $buffer_count = 0;

    while ($buffer_count < $url_count){

        if($open_connections < $max_connections && $index <= $max_index):
            for($i=$open_connections;$i<$max_connections && $index <= $max_index;$i++):
                $url = $all_urls[$index];
                $ch[$index] = curl_init($url);
                curl_setopt($ch[$index],CURLOPT_FOLLOWLOCATION, true);
                curl_setopt($ch[$index],CURLOPT_RETURNTRANSFER, true);
                curl_setopt($ch[$index],CURLOPT_COOKIESESSION, false);
                curl_setopt($ch[$index],CURLOPT_SSL_VERIFYHOST , false);
                curl_setopt($ch[$index],CURLOPT_SSL_VERIFYPEER , false);
                curl_setopt($ch[$index],CURLOPT_COOKIEJAR, $cookie_file);
                curl_setopt($ch[$index],CURLOPT_COOKIEFILE, $cookie_file);
                curl_setopt($ch[$index],CURLOPT_USERAGENT,$useragent);
                curl_multi_add_handle($mh, $ch[$index]);
                $open_connections++;
                $index++;
                $execReturnValue = curl_multi_exec($mh,$running);
                usleep(200);
            endfor;
        endif;

        $execReturnValue = curl_multi_exec($mh,$running);
        $ready=curl_multi_select($mh);


        while($info=curl_multi_info_read($mh)){
            $status=curl_getinfo($info['handle'],CURLINFO_HTTP_CODE);
            if($status==200){
                $successUrl=curl_getinfo($info['handle'],CURLINFO_EFFECTIVE_URL);
                $curl_index = $url_index[$successUrl];
                $return_buffer[$curl_index] = curl_multi_getcontent($ch[$curl_index]);
                $buffer_count = count($return_buffer);
                curl_multi_remove_handle($mh, $ch[$curl_index]);
                curl_close($ch[$curl_index]);
                unset($ch[$curl_index]);
                $open_connections--;
            }else{

                echo "ERROR: $status\n";
            }
        }
    } 

    curl_multi_close($mh);
    unset($mh);

    return $return_buffer;
}



private function _extract_more_info($html)
{

    $buffer = array();


    $query = "//img[@id='ctl00_cphMain_cntrlProductProfile_imgprodimage']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['td_img_url'] = $node?trim($node->getAttribute('src')):null;
    unset($result);


    $query = "//span[@class='priceLarge']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['price'] = $node?trim($node->nodeValue):null;
    if($buffer['price'] == 'Req. Auth.') return null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLtFinalPrice']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['msrp'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLTMRF']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['manf_part_number'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLblUPC']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    $buffer['upc_part_number'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//td[@class='black_text_WUL']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['manufacturer'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//td[@class='textt' and @colspan='3']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['short_description'] = $node?trim($node->nodeValue):null;
    unset($result);





    $query = "//div[@id='ctl00_cphMain_pnlMarketingDesc']//td[@class='textt']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['long_description'] = $node?trim($node->nodeValue):null;
    unset($result);

    $query = "//table[@id='ctl00_cphMain_cntrlMainSpecs_dgSpecs']";
    $result = $this->_xquery($html,$query);
    $table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    unset($result);

    if(!$table) return null;
    $table_array = Array();
    $rows = $table->getElementsByTagName('tr');
    foreach($rows as $tr):
        $temp = Array();
        $columns = $tr->getElementsByTagName('td');
        $caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
        $value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;

        if ($caption) $table_array[$caption] = $value;
    endforeach;


    $buffer['main_specs']=$table_array;


    $query = "//table[@id='ctl00_cphMain_cntrlExtSpecs_tblData']";
    $result = $this->_xquery($html,$query);
    $table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    unset($result);
    $buffer['additional_specs'] = null;
    if(!$table) return $buffer;


    $table_array = Array();
    $rows = $table->getElementsByTagName('tr');
    foreach($rows as $tr):
        $temp = Array();
        $columns = $tr->getElementsByTagName('td');
        $caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
        $value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;

        if ($caption) $table_array[$caption] = $value;
    endforeach;
    $buffer['additional_specs']=$table_array;;
    return $buffer;

}



private function _xquery($html,$query,$allnodes = false){
    $src = '';
    $dom = new DOMDocument();
    $node = null;
    if (@$dom->loadHTML($html)) {
        $xpath = new DOMXpath($dom);
        $nodeList = $xpath->query($query);
        if ($nodeList->length > 0) {
            $node = $allnodes==false?$nodeList->item(0):$nodeList;
        }
    }
    unset($xpath);
    unset($nodeList);
    unset($dom);
    return $node;
}
10
  • Are you sure it's a leak, or does it just need more memory? Commented Apr 6, 2011 at 18:00
  • How much memory is it consuming before php services are shut down and what is the maximum amount of data you expect to be stored at any given time? Commented Apr 6, 2011 at 18:01
  • I believe it's a leak as I unset any variables where I can. It gets up to 100mb+ in memory usage. None of the pages are that big nor am I storing that much data in any variables. Commented Apr 6, 2011 at 18:03
  • @65Fbef05: I'm running the script again to get an exact number for you. My memory_limit is set to 128M Commented Apr 6, 2011 at 18:05
  • 1
    I don't think you " tried to cut it down as much as i could" lol Commented Apr 6, 2011 at 18:06

1 Answer 1

1

Strategies to find a leak?

  • make sure it is a leak (if processing 1/100 of the data, is memory still not freed? 1/1000?)
  • think about complexity: if foo is O(n), bar is O(n) and bar calls foo, the result may become O(n*n).
  • experiment: disable parts of the program until it leaks no more

At first sight, you're crawling a series of url's. These may contain more url's, to be crawled using the MultiCrawl method. Are you sure there can't be a cycle in there? (working with folders has tricked me more than once: browsing '.' as a subfolder yields infinite loops)

Sign up to request clarification or add additional context in comments.

1 Comment

It doesn't do any depth. I have a dbf of product numbers/prices. It uses the product number to go to the distributors site to grab some additional information.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.