
I am quite new to using Selenium (Python). I have just scraped some data off a website exactly the way I wanted, but the code only pulls the first 10 records. It doesn't proceed to pick up the rest of the content by looping through the other pages. Would you happen to know why the script fails to open the subsequent pages? Any help would be greatly appreciated. If you look for the """This is for navigation to next page""" comment, I think that is the incorrect area.

Code:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException

import time
from openpyxl import Workbook

"""This is to collect links that associate with all the profiles present in Allen Overy website""" def get_links(driver, target, upper_datetime_str, lower_datetime_str):

"""This part allows me to filter on date so I'm not having to pull back all the data each time it's run"""
if upper_datetime_str == '' and lower_datetime_str == '':
    time_constrain = 0
else:
    time_constrain = 1
    upper_datetime = time.strptime(upper_datetime_str,'%d/%m/%Y')
    lower_datetime = time.strptime(lower_datetime_str,'%d/%m/%Y')

"""This is to search for news that present in Freshfields website"""
"""Go to page that contains news list"""
driver.get(target) 

isbreak = 0
list_links = []
while True: 
    try:
        """Get links that associate to news in each page"""
        list_ppl_link =  driver.find_elements_by_xpath('//div[@class = "srch-Title3"]')                                                 
        for item in list_ppl_link:                
            rel_date = item.find_elements_by_xpath('//div[@class = "srch-Metadata2"]')
            if time_constrain == 0:
                rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                list_links.append(rel_link)
            else:
                input_datetime = time.strptime(rel_date,'%d %B %Y')
                if input_datetime < lower_datetime:
                    isbreak = 1
                    break
                elif input_datetime >=lower_datetime and input_datetime <= upper_datetime:
                    rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                    list_links.append(rel_link)

        """This is for navigation to next page"""
        next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]')
        if next_b.get_attribute('class') == 'srch-Page-img':
            next_b.click() 
        else:
            break

        if isbreak == 1:
            break

    except KeyboardInterrupt:
        break
    except NoSuchWindowException:
        break           
    except:
        raise

return list_links

def get_news_content(driver, link):
    driver.get(link)

    try:
        rels_date = driver.find_element_by_class_name('ao-rteElement-H4').text
    except NoSuchElementException:
        rels_date = ''

    try:
        headline = driver.find_element_by_class_name('ao-rteElement-H1').text
    except NoSuchElementException:
        headline = ''

    try:
        content1 = driver.find_element_by_class_name('ao-rteElement-introText').text
    except NoSuchElementException:
        content1 = ''

    try:
        content2 = driver.find_element_by_id('ctl00_PlaceHolderMain_main__ControlWrapper_ExtendedRichHtmlField').text
    except NoSuchElementException:
        content2 = ''

    content = '\n'.join([content1, content2]).strip()

    return {'news_date': rels_date,
            'news_content': content,
            'news_headline': headline}

def extract_data(adict):

    return [adict.get('news_date', ''),
            adict.get('news_headline', ''),
            adict.get('news_content', '')]

===============================================================================================================

if name == "main": """Highlight the file variables such as file name and headers for columns with a date stamp of===""" printout = time.strftime('%y%m%d_%H%M%S', time.localtime()) + '_allenovery_news.xlsx'
header = ['Firm Name','Date', 'Headline Title', 'News Content']

wb = Workbook()
ws = wb.active
ws.append(header)
log = open('test.txt', 'w')

"""Identify target link where the data is stored"""
target = 'http://www.allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22'

"""Engage Chrome Driver"""
chromeOptions = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images":2}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()

"""Time format : dd/mm/yyyy"""
"""Select your timeframe below should you wish. Otherwise leave fields blank '' """
upper_datetime_str = ''
lower_datetime_str = ''

print('Collecting news links')
list_ppls = get_links(driver, target, upper_datetime_str, lower_datetime_str)    
driver.quit()


"""Engage Chrome Driver"""
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()


    total_link = len(list_ppls)
    idx = 0
    while idx < total_link:
        try:
            print(idx + 1, 'in', total_link, list_ppls[idx])

            """Append client name to data"""
            ws.append(['Allen and Overy'] + extract_data(get_news_content(driver, list_ppls[idx])))
            idx += 1
            if not (idx % 100):
                wb.save(printout)
                driver.quit()
                time.sleep(10)
                driver = webdriver.Chrome(chrome_options=chromeOptions)
                driver.set_page_load_timeout(120)
                driver.maximize_window()

        except KeyboardInterrupt:
            break
        except NoSuchWindowException:
            break
        except:
            driver.quit()
            time.sleep(10)
            driver = webdriver.Chrome(chrome_options=chromeOptions)
            driver.set_page_load_timeout(120)
            driver.maximize_window()
            continue

    wb.save(printout)
    log.close()
    driver.quit()
  • Could you be more succinct? Just share the exact lines of code that cause your problem, a short issue description, and the exception log (if you got any). Commented Feb 8, 2017 at 21:14
  • Target = 'allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22' My code pulls the data for that link, but just the first 10 records on page 1. The code that is stopping the programme from looping through is: """This is for navigation to next page""" next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') if next_b.get_attribute('class') == 'srch-Page-img': next_b.click() else: break Commented Feb 8, 2017 at 21:32

1 Answer


You're trying to handle the wrong element (driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') is not what you actually need). Try the code below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

try:
    time.sleep(1)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
except TimeoutException:
    break

This should allow you to keep moving to the next page for as long as the "Next" button (>) is available.
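For context, here is a rough sketch of how that snippet could slot into the while loop of your get_links function, replacing the original navigation block. This assumes the SRP_NextImg ID is present on every results page; driver and the link-collection code come from your original script:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

driver = webdriver.Chrome()
list_links = []
while True:
    # Collect the links on the current page, as in the original loop
    for item in driver.find_elements_by_xpath('//div[@class = "srch-Title3"]'):
        list_links.append(item.find_element_by_tag_name('a').get_attribute('href'))

    # Navigate to the next page; stop once no clickable "Next" button remains
    try:
        time.sleep(1)
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
    except TimeoutException:
        break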


Comments

Thanks for getting back to me. I incorporated your suggestion into the script and got the following error: "Element is not clickable". Could I share the script with you?
I got the same error. It may be a glitch in another area of the script. I am so close to making it work but just can't seem to get this last bit right.
Show the complete exception log
It doesn't seem to be allowing me to take a screenshot of my Anaconda prompt window. I don't have any other logs.
I mean the full log of the "Element is not clickable" error
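On the "Element is not clickable" error discussed above: a common cause is the button being outside the viewport or covered by another element when the click fires. A minimal sketch of a workaround, assuming the same SRP_NextImg ID and a live driver from the script above, is to scroll the button into view and fall back to a JavaScript click:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException

# Wait for the button to exist, then make sure it is actually on screen
next_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.ID, "SRP_NextImg")))
driver.execute_script("arguments[0].scrollIntoView(true);", next_btn)
try:
    next_btn.click()
except WebDriverException:
    # Fall back to a JavaScript click if another element still intercepts it
    driver.execute_script("arguments[0].click();", next_btn)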
