1

I have minimal coding knowledge and I'm trying to adapt some tutorials without success.

The JavaScript code I wish to adapt (script A) is pasted into the Chrome developer console and successfully pulls the data I need. This JavaScript snippet identifies the price displayed in the largest font on an e-commerce page.

A second tutorial (script B) is run from the shell and calls the Puppeteer library. This script pulls some hotel booking data and runs successfully.

I wish to adapt script A to run from the shell using the Puppeteer library.

This is Script A -

// Gather every element nested under <body> into a real array so that
// array methods such as map() can be used on the result.
let elements = Array.from(document.querySelectorAll(' body *'));

/**
 * Builds a plain record describing an element's position and trimmed text.
 *
 * `fontSize` is only set when the text is 30 characters or fewer AND the
 * bounding box is not at the origin (x = 0 and y = 0). Every other element
 * still gets a record, just without `fontSize`.
 *
 * @param {Element} element - DOM element to inspect.
 * @returns {{fontSize?: number, y: number, x: number, text: string}} Record for the element.
 */
function createRecordFromElement(element) {
 const text = element.textContent.trim();
 const record = {};
 const bBox = element.getBoundingClientRect();

 const isShortText = text.length <= 30;
 const isAtOrigin = bBox.x === 0 && bBox.y === 0;
 if (isShortText && !isAtOrigin) {
  // The computed font size is a string (e.g. "16px"); parse it explicitly as base 10.
  record['fontSize'] = Number.parseInt(getComputedStyle(element)['fontSize'], 10);
 }

 // Position and text are recorded for every element, whether or not fontSize was set.
 record['y'] = bBox.y;
 record['x'] = bBox.x;
 record['text'] = text;
 return record;
}
// Convert each collected element into its position/text record.
let records = elements.map((element) => createRecordFromElement(element));

/**
 * Decides whether a record could be a price.
 *
 * A record is rejected when its `y` is greater than 600, when it has no
 * `fontSize`, or when its text does not match the price pattern as a whole
 * (optional "US " prefix, optional currency marker, digits/commas with an
 * optional decimal part, optional "AED" suffix).
 *
 * @param {{y: number, fontSize?: number, text: string}} record - Record to check.
 * @returns {boolean} `true` when the record may be a price, otherwise `false`.
 */
function canBePrice(record) {
 const pricePattern = /(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/;

 // Checks run in this order so the text is only inspected when the cheaper tests pass.
 if (record.y > 600) {
  return false;
 }
 if (record.fontSize == null) {
  return false;
 }
 return record.text.match(pricePattern) !== null;
}

// Keep only the records that look like prices.
let possiblePriceRecords = records.filter(canBePrice);

// Largest font first; records with the same font size are ordered top-to-bottom
// (smaller y first). The comparator returns a number because Array.prototype.sort
// reads the sign of the result: a boolean can never be negative, so it cannot
// express "a comes before b" and yields an unreliable order.
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function (a, b) {
 if (a['fontSize'] === b['fontSize']) {
  return a['y'] - b['y'];
 }
 return b['fontSize'] - a['fontSize'];
});

// Print the two best candidates. slice() avoids a TypeError when fewer than
// two candidates were found on the page.
if (priceRecordsSortedByFontSize.length === 0) {
 console.log('No price candidates found');
}
priceRecordsSortedByFontSize.slice(0, 2).forEach(function (record) {
 console.log(record['text']);
});

This is Script B -

const puppeteer = require('puppeteer');

let bookingUrl = 'insert booking URL';

// Launches headless Chromium, loads the booking page, extracts one object per
// hotel block and prints the resulting array.
(async () => {
    const browser = await puppeteer.launch({ headless: true });
    try {
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 926 });
        await page.goto(bookingUrl);

        // get hotel details
        // The callback runs inside the page, so it can only use browser globals
        // and must return serializable data.
        let hotelData = await page.evaluate(() => {
            let hotels = [];
            // get the hotel elements
            let hotelsElms = document.querySelectorAll('div.sr_property_block[data-hotelid]');
            // get the hotel data
            hotelsElms.forEach((hotelelement) => {
                let hotelJson = {};
                try {
                    hotelJson.name = hotelelement.querySelector('span.sr-hotel__name').innerText;
                    hotelJson.reviews = hotelelement.querySelector('span.review-score-widget__subtext').innerText;
                    hotelJson.rating = hotelelement.querySelector('span.review-score-badge').innerText;
                    if (hotelelement.querySelector('strong.price')) {
                        hotelJson.price = hotelelement.querySelector('strong.price').innerText;
                    }
                } catch (exception) {
                    // Deliberate best-effort: a hotel block missing one of the
                    // expected elements is still pushed with whatever fields
                    // were read before the lookup failed.
                }
                hotels.push(hotelJson);
            });
            return hotels;
        });

        console.dir(hotelData);
    } finally {
        // Always close the browser, even after a failure, so the Chromium
        // process does not keep the script running.
        await browser.close();
    }
})().catch((error) => {
    // Surface failures (bad URL, navigation timeout, ...) and exit non-zero.
    console.error(error);
    process.exitCode = 1;
});

I've made various attempts at adapting Script A into the format of Script B, and many different errors have been thrown. Without coding knowledge, I'm not getting anywhere.

Here's one of many variations I've tried, called Script C -

const puppeteer = require('puppeteer-core');

let bookingUrl = 'https://shop.coles.com.au/a/dianella/product/moccona-coffee-capsules-espresso-7';

// Launches headless Chromium, loads the product page, finds the text that looks
// like a price and is rendered in the largest font, and prints it in the shell.
(async () => {
    const browser = await puppeteer.launch({
        executablePath: '/usr/bin/chromium-browser',
        headless: true
    });
    try {
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 926 });
        await page.goto(bookingUrl);

        // Everything inside this callback runs in the page, not in Node. A
        // console.log() here would print to the browser's console, so the
        // candidate texts are returned and logged from Node instead.
        const priceTexts = await page.evaluate(() => {
            // Builds a record with the element's position and trimmed text;
            // fontSize is only set for short text not positioned at (0, 0).
            function createRecordFromElement(element) {
                const text = element.textContent.trim();
                const record = {};
                const bBox = element.getBoundingClientRect();

                if (text.length <= 30 && !(bBox.x === 0 && bBox.y === 0)) {
                    record['fontSize'] = Number.parseInt(getComputedStyle(element)['fontSize'], 10);
                }
                record['y'] = bBox.y;
                record['x'] = bBox.x;
                record['text'] = text;
                return record;
            }

            // Rejects records with y above 600, without a fontSize, or whose
            // text does not match the price pattern as a whole.
            function canBePrice(record) {
                if (record['y'] > 600 || record['fontSize'] == null) {
                    return false;
                }
                return record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/) !== null;
            }

            const elements = [...document.querySelectorAll(' body *')];
            const possiblePriceRecords = elements.map(createRecordFromElement).filter(canBePrice);

            // Largest font first; equal sizes are ordered top-to-bottom. The
            // comparator must return a number, because sort() reads its sign.
            const priceRecordsSortedByFontSize = possiblePriceRecords.sort((a, b) => {
                if (a['fontSize'] === b['fontSize']) {
                    return a['y'] - b['y'];
                }
                return b['fontSize'] - a['fontSize'];
            });

            // Return plain strings so the result serializes back to Node.
            return priceRecordsSortedByFontSize.map((record) => record['text']);
        });

        if (priceTexts.length === 0) {
            console.log('No price candidates found');
        } else {
            console.log(priceTexts[0]);
        }
    } finally {
        // Always close the browser, even after a failure, so the Chromium
        // process does not keep the script running.
        await browser.close();
    }
})().catch((error) => {
    // Surface failures (navigation errors, blocked requests, ...) and exit non-zero.
    console.error(error);
    process.exitCode = 1;
});

Here are the links to the tutorials for info -

https://www.scrapehero.com/how-to-scrape-prices-from-any-ecommerce-website/

https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/

Is there anything obviously wrong in Script C?

1 Answer 1

1

After reading through script C, it appears that you have not made any mistakes; rather, the website you are attempting to access has decided to block scraper bots.

A quick host lookup on the domain shows that they are using the security service section.io to block scraper bots on their website. See:

shop.coles.com.au is an alias for shop.coles.com.au.c.section.io. shop.coles.com.au.c.section.io is an alias for shop.coles.com.au.x.section.io

Sign up to request clarification or add additional context in comments.

1 Comment

Thanks for looking at the code. I didn't anticipate they would block bots!

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.