0

I need your help, folks, to scrape a text element which is encrypted. Here is my spider:

      import json
import scrapy


class YPSpider(scrapy.Spider):
    """Crawl infobel.com business listings and resolve encrypted phone numbers.

    Listing pages are parsed for title, address, village and phone. When the
    phone is not present as plain digits, a follow-up request is sent to the
    site's ``Search/Decrypt`` endpoint and the item is emitted once that
    request returns.
    """

    name = 'yp'
    start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/']

    # URL template of the decrypt endpoint; ``{}`` receives the URL-quoted token.
    DECRYPT_URL = 'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'

    @staticmethod
    def _clean(text):
        """Return *text* without carriage returns, tabs or surrounding whitespace."""
        return text.replace('\r', '').replace('\t', '').strip()

    def parse(self, response):
        """Handle both listing pages and decrypt-endpoint responses.

        Args:
            response: The Scrapy response. If ``response.meta['has_phone']``
                is truthy it is treated as the JSON answer of the decrypt
                endpoint for the item stored in ``response.meta['item']``;
                otherwise it is treated as an HTML listing page.

        Yields:
            Item dicts with ``title``, ``address``, ``village`` and ``phone``
            keys, a request for the next listing page, and decrypt requests
            for listings whose phone number is encrypted.
        """
        # Imported here so the snippet only needs its existing top-level imports.
        from urllib.parse import quote

        # Decrypt responses carry the pending item in meta; finish it and stop,
        # there is no pagination or listing markup to look for in that payload.
        if response.meta.get('has_phone'):
            listing = response.meta['item']
            listing['phone'] = json.loads(response.body)['result']
            yield listing
            return

        # Take the link target (@href), not the serialized element itself.
        next_page_url = response.xpath('//*[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = box.xpath(
                './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
            ).extract()

            # default='' keeps a listing without a title from raising on .strip().
            title = box.xpath(
                './/h2[@class="customer-item-name"]/a/text()'
            ).extract_first(default='').strip()
            address = self._clean(address_lines[0]) if address_lines else ''
            # Index 1 is only valid when at least two address lines were found.
            village = self._clean(address_lines[1]) if len(address_lines) >= 2 else ''
            phone = box.xpath(
                './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
            ).extract()

            listing = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }

            if not phone:
                continue

            if phone[0].isnumeric():
                # The number is already readable; emit the item right away.
                listing['phone'] = phone[0]
                yield listing
            elif len(phone) >= 2:
                # The second text node holds the encrypted token. Quote it with
                # safe='' so characters such as '+' and '/' are percent-encoded
                # rather than being interpreted as part of the query syntax.
                yield scrapy.Request(
                    self.DECRYPT_URL.format(quote(phone[1], safe='')),
                    callback=self.parse,
                    meta={'item': listing, 'has_phone': True},
                )

My problem is that the returned phone string is encoded, and I need your help to get the text. Thank you in advance!

2 Answers 2

1
import json
import scrapy


class YPSpider(scrapy.Spider):
    """Crawl infobel.com business listings and resolve encrypted phone numbers.

    Each listing page yields one item per business. Phones that appear as
    plain digits are emitted directly; otherwise the encrypted token found in
    the page is sent to the site's ``Search/Decrypt`` endpoint and the item is
    emitted when that JSON response arrives.
    """

    name = 'yp'
    start_urls = ['http://www.infobel.com/fr/france/business/50000/informatique_internet/']

    # URL template of the decrypt endpoint; ``{}`` receives the URL-quoted token.
    DECRYPT_URL = 'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'

    @staticmethod
    def _clean(text):
        """Return *text* without carriage returns, tabs or surrounding whitespace."""
        return text.replace('\r', '').replace('\t', '').strip()

    def parse(self, response):
        """Handle both listing pages and decrypt-endpoint responses.

        Args:
            response: The Scrapy response. If ``response.meta['has_phone']``
                is truthy it is treated as the JSON answer of the decrypt
                endpoint for the item stored in ``response.meta['item']``;
                otherwise it is treated as an HTML listing page.

        Yields:
            Item dicts with ``title``, ``address``, ``village`` and ``phone``
            keys, a request for the next listing page, and decrypt requests
            for listings whose phone number is encrypted.
        """
        # Imported here so the snippet only needs its existing top-level imports.
        from urllib.parse import quote

        # Decrypt responses carry the pending item in meta; finish it and stop,
        # there is no pagination or listing markup to look for in that payload.
        if response.meta.get('has_phone'):
            listing = response.meta['item']
            listing['phone'] = json.loads(response.body)['result']
            yield listing
            return

        next_links = response.xpath(
            '//ul[@class="pagination"]//*[@rel="next"]/@href'
        ).extract()
        if next_links:
            # Several elements may match; the last one is used as the next page.
            yield response.follow(next_links[-1], callback=self.parse)

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = box.xpath(
                './/span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()'
            ).extract()

            # default='' keeps a listing without a title from raising on .strip().
            title = box.xpath(
                './/h2[@class="customer-item-name"]/a/text()'
            ).extract_first(default='').strip()
            address = self._clean(address_lines[0]) if address_lines else ''
            # Index 1 is only valid when at least two address lines were found.
            village = self._clean(address_lines[1]) if len(address_lines) >= 2 else ''
            phone = box.xpath(
                './/span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()'
            ).extract()

            listing = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }

            if not phone:
                continue

            if phone[0].isnumeric():
                # The number is already readable; emit the item right away.
                listing['phone'] = phone[0]
                yield listing
            elif len(phone) >= 2:
                # The second text node holds the encrypted token. Quote it with
                # safe='' so characters such as '+' and '/' are percent-encoded
                # rather than being interpreted as part of the query syntax.
                yield scrapy.Request(
                    self.DECRYPT_URL.format(quote(phone[1], safe='')),
                    callback=self.parse,
                    meta={'item': listing, 'has_phone': True},
                )
Sign up to request clarification or add additional context in comments.

7 Comments

Hi @yash pokar, thank you very much, it does what I was looking for; you saved my day. I have just one question, please: how can I also get phone numbers that start with digits other than 06? I see that the crawl does not scrape numbers starting with 09 or 01. Regards, and thank you again.
@AbdelmoulaNami just put that condition before yield item
It works fine. Do you know how I can make it go on to the next pages, please?
next_page = response.xpath('//*[@rel="next"]').extract_first()
@AbdelmoulaNami now schedule next page request
|
0

It seems the website uses its own internal AJAX calls to decrypt phone number strings, as you can see in your web browser's inspector: [screenshot: enter image description here]

You can replicate this request in scrapy:

from urllib.parse import quote
from scrapy import Request

def parse(self, response):
    """Replicate the site's AJAX decrypt call for one sample encrypted string.

    Args:
        response: The response passed in by Scrapy; it is not used here, the
            example only demonstrates how to build the decrypt request.

    Yields:
        A single ``Request`` to the ``Search/Decrypt`` endpoint with the
        encrypted string passed in the ``encryptedString`` query parameter.
    """
    # safe='' percent-encodes every reserved character, including the '/' and
    # '+' that occur in the token, so the value survives as one query argument.
    code = quote('iHB/1oF0m7ELfO6Mfsl+mvm+o8SZZ37q', safe='')
    url = f"https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={code}"
    # The token travels entirely in the query string, so no request body is
    # attached (a body built from an undefined `data` would raise NameError).
    yield Request(url)

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.