scrapy encoding data text python

Question

I need your help, folks, to scrape a text element which is encrypted. Here is my spider:

import json
import scrapy


class YPSpider(scrapy.Spider):
    """Scrape business listings from infobel.com, including encrypted phones.

    Listing pages are parsed for title, address and phone. A phone that is
    not shown as plain digits is resolved through the site's
    ``Search/Decrypt`` endpoint, and the item is yielded once that response
    comes back.
    """

    name = 'yp'
    start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/']

    @staticmethod
    def _clean(text):
        """Return *text* without carriage returns, tabs or outer whitespace."""
        return text.replace('\r', '').replace('\t', '').strip()

    def parse(self, response):
        """Parse a listing page, or a Decrypt response for a pending item.

        Yields item dicts with ``title``, ``address``, ``village`` and
        ``phone`` keys, plus follow-up requests for the next listing page
        and for phone decryption. Listings without a usable phone are
        skipped.
        """
        # Function-scope import: the snippet's top-level imports only
        # provide json and scrapy.
        from urllib.parse import quote

        # A Decrypt response carries the half-built item in its meta; it is
        # JSON, so it must not go through the HTML extraction below.
        if response.meta.get('has_phone'):
            item = response.meta['item']
            # NOTE: assumes the endpoint answers with JSON holding 'result'.
            item['phone'] = json.loads(response.body)['result']
            yield item
            return

        # Select the href itself (not the whole element) so that
        # response.follow receives a URL.
        next_page_url = response.xpath('//*[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = box.xpath(
                './/span[contains(@class, "fa-map-marker")]'
                '/../span[@class="detail-text"]//text()'
            ).extract()
            phone = box.xpath(
                './/span[contains(@class, "icon-phone")]'
                '/../span[@class="detail-text"]/text()'
            ).extract()

            # default='' keeps a listing without a title from raising on
            # None.strip().
            title = box.xpath(
                './/h2[@class="customer-item-name"]/a/text()'
            ).extract_first(default='').strip()

            item = {
                'title': title,
                'address': self._clean(address_lines[0]) if address_lines else '',
                # Index 1 exists only when there are at least two lines.
                'village': self._clean(address_lines[1]) if len(address_lines) >= 2 else '',
                'phone': phone,
            }

            if not phone:
                continue

            if phone[0].isnumeric():
                item['phone'] = phone[0]
                yield item
            elif len(phone) >= 2:
                # The encrypted token may contain '+' and '/', so it has to
                # be percent-encoded before going into the query string.
                yield scrapy.Request(
                    'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(
                        quote(phone[1], safe='')
                    ),
                    meta={'item': item, 'has_phone': True},
                )

My problem is that the returned phone string is encoded, and I need your help to get the text. Thank you in advance!

Yash Pokar · Accepted Answer · 2018-10-26 10:48:48Z

1

import json
import scrapy


class YPSpider(scrapy.Spider):
    """Spider for infobel.com listings that also decrypts hidden phone numbers.

    ``parse`` serves two kinds of responses: HTML listing pages, and the JSON
    replies of the ``Search/Decrypt`` endpoint (flagged with ``has_phone`` in
    the request meta).
    """

    name = 'yp'
    start_urls = ['http://www.infobel.com/fr/france/business/50000/informatique_internet/']

    def parse(self, response):
        """Yield scraped items and the follow-up requests they depend on.

        For a Decrypt reply, the pending item from ``response.meta`` is
        completed with the decrypted phone and yielded. For a listing page,
        the next page is scheduled and every listing with a phone is turned
        into either an item (plain numeric phone) or a Decrypt request
        (encrypted phone). Listings without a phone are not yielded.
        """
        # Imported here so the class does not depend on a new module-level
        # import being added next to ``json`` and ``scrapy``.
        from urllib.parse import quote

        if response.meta.get('has_phone'):
            # JSON reply: nothing to paginate or extract, just finish the item.
            pending = response.meta['item']
            # NOTE: assumes the reply is a JSON object with a 'result' key.
            payload = json.loads(response.body)
            pending['phone'] = payload['result']
            yield pending
            return

        pages = response.xpath('//ul[@class="pagination"]//*[@rel="next"]/@href').extract()
        next_page = pages[-1] if pages else None
        if next_page:
            yield response.follow(next_page)

        def tidy(line):
            # Address lines come with layout whitespace around them.
            return line.replace('\r', '').replace('\t', '').strip()

        for listing in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = listing.xpath('.//span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()').extract()

            # A missing title yields '' instead of crashing on None.strip().
            title = listing.xpath('.//h2[@class="customer-item-name"]/a/text()').extract_first(default='').strip()
            address = tidy(address_lines[0]) if address_lines else ''
            # Two lines are required before index 1 can be read.
            village = tidy(address_lines[1]) if len(address_lines) >= 2 else ''
            phone = listing.xpath('.//span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()').extract()

            record = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }

            if phone:
                if phone[0].isnumeric():
                    record['phone'] = phone[0]

                    yield record
                elif len(phone) >= 2:
                    # Percent-encode the token: it may contain '+' and '/',
                    # which are not safe to place raw in a query string.
                    token = quote(phone[1], safe='')
                    yield scrapy.Request('https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(token), meta={'item': record, 'has_phone': True})

edited Oct 26, 2018 at 10:48

answered Oct 1, 2018 at 3:35

Yash Pokar

5,551 reputation · 1 gold badge · 15 silver badges · 27 bronze badges

Sign up to request clarification or add additional context in comments.

7 Comments

Abdelmoula Nami Over a year ago

Hi @yash pokar, thank you very much, it does what I was looking for; you saved my day. I have just one question, please: how can I also get phone numbers that start with something other than 06? I see that the crawl does not scrape numbers starting with 09 or 01. Regards, and thank you again.

Yash Pokar Over a year ago

@AbdelmoulaNami just put that condition before yield item

Abdelmoula Nami Over a year ago

It works fine. Do you know how I can make it go on to the next pages, please?

Yash Pokar Over a year ago

next_page = response.xpath('//*[@rel="next"]').extract_first()

Yash Pokar Over a year ago

@AbdelmoulaNami now schedule next page request

|

Granitosaurus · Accepted Answer · 2018-10-01 06:01:12Z

0

Seems like the website is using their own internal AJAX calls to decrypt phone number strings; if you look at your web browser inspector:

You can replicate this request in scrapy:

from urllib.parse import quote
from scrapy import Request

def parse(self, response):
    """Request the decrypted form of an encrypted phone string.

    The token is percent-encoded in full (``safe=''``) because it contains
    '+' and '/' characters, then sent to the site's Decrypt endpoint as a
    plain GET request; the reply is handled by ``parse_phone``.
    """
    code = quote('iHB/1oF0m7ELfO6Mfsl+mvm+o8SZZ37q', safe='')
    url = f"https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={code}"
    # No request body: the token travels in the query string, and a separate
    # callback keeps the reply from being fed back into this method.
    yield Request(url, callback=self.parse_phone)


def parse_phone(self, response):
    """Yield the decrypted phone number from a Decrypt reply.

    NOTE: assumes the endpoint answers with a JSON object that has a
    'result' key - confirm against the actual reply.
    """
    import json

    yield {'phone': json.loads(response.text)['result']}

edited Oct 1, 2018 at 6:01

answered Oct 1, 2018 at 1:02

Granitosaurus

21.6k reputation · 6 gold badges · 64 silver badges · 88 bronze badges

Collectives™ on Stack Overflow

scrapy encoding data text python

2 Answers 2

7 Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

7 Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related