
I am quite new to using Selenium (Python). I have just scraped some data off a website exactly the way I wanted, but the code only pulls the first 10 records. It doesn't proceed to pick up the rest of the content by looping through the other pages. Would you happen to know why the script fails to open the subsequent pages? Any help would be greatly appreciated. If you look for the """This is for navigation to next page""" comment, I think that is the incorrect area.

Code:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException

import time
from openpyxl import Workbook

"""This is to collect links that associate with all the profiles present in Allen Overy website""" def get_links(driver, target, upper_datetime_str, lower_datetime_str):

"""This part allows me to filter on date so I'm not having to pull back all the data each time it's run"""
if upper_datetime_str == '' and lower_datetime_str == '':
    time_constrain = 0
else:
    time_constrain = 1
    upper_datetime = time.strptime(upper_datetime_str,'%d/%m/%Y')
    lower_datetime = time.strptime(lower_datetime_str,'%d/%m/%Y')

"""This is to search for news that present in Freshfields website"""
"""Go to page that contains news list"""
driver.get(target) 

isbreak = 0
list_links = []
while True: 
    try:
        """Get links that associate to news in each page"""
        list_ppl_link =  driver.find_elements_by_xpath('//div[@class = "srch-Title3"]')                                                 
        for item in list_ppl_link:                
            rel_date = item.find_elements_by_xpath('//div[@class = "srch-Metadata2"]')
            if time_constrain == 0:
                rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                list_links.append(rel_link)
            else:
                input_datetime = time.strptime(rel_date,'%d %B %Y')
                if input_datetime < lower_datetime:
                    isbreak = 1
                    break
                elif input_datetime >=lower_datetime and input_datetime <= upper_datetime:
                    rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                    list_links.append(rel_link)

        """This is for navigation to next page"""
        next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]')
        if next_b.get_attribute('class') == 'srch-Page-img':
            next_b.click() 
        else:
            break

        if isbreak == 1:
            break

    except KeyboardInterrupt:
        break
    except NoSuchWindowException:
        break           
    except:
        raise

return list_links

def get_news_content(driver, link):
    driver.get(link)

    try:
        rels_date = driver.find_element_by_class_name('ao-rteElement-H4').text
    except NoSuchElementException:
        rels_date = ''

    try:
        headline = driver.find_element_by_class_name('ao-rteElement-H1').text
    except NoSuchElementException:
        headline = ''

    try:
        content1 = driver.find_element_by_class_name('ao-rteElement-introText').text
    except NoSuchElementException:
        content1 = ''

    try:
        content2 = driver.find_element_by_id('ctl00_PlaceHolderMain_main__ControlWrapper_ExtendedRichHtmlField').text
    except NoSuchElementException:
        content2 = ''

    content = '\n'.join([content1, content2]).strip()

    return {'news_date': rels_date,
            'news_content': content,
            'news_headline': headline}

def extract_data(adict):

    return [adict.get('news_date', ''),
            adict.get('news_headline', ''),
            adict.get('news_content', '')]

===============================================================================================================

if name == "main": """Highlight the file variables such as file name and headers for columns with a date stamp of===""" printout = time.strftime('%y%m%d_%H%M%S', time.localtime()) + '_allenovery_news.xlsx'
header = ['Firm Name','Date', 'Headline Title', 'News Content']

wb = Workbook()
ws = wb.active
ws.append(header)
log = open('test.txt', 'w')

"""Identify target link where the data is stored"""
target = 'http://www.allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22'

"""Engage Chrome Driver"""
chromeOptions = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images":2}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()

"""Time format : dd/mm/yyyy"""
"""Select your timeframe below should you wish. Otherwise leave fields blank '' """
upper_datetime_str = ''
lower_datetime_str = ''

print('Collecting news links')
list_ppls = get_links(driver, target, upper_datetime_str, lower_datetime_str)    
driver.quit()


"""Engage Chrome Driver"""
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()


    total_link = len(list_ppls)
    idx = 0
    while idx < total_link:
        try:
            print(idx + 1, 'in', total_link, list_ppls[idx])

            """Append client name to data"""
            ws.append(['Allen and Overy'] + extract_data(get_news_content(driver, list_ppls[idx])))
            idx += 1
            if not (idx % 100):
                wb.save(printout)
                driver.quit()
                time.sleep(10)
                driver = webdriver.Chrome(chrome_options=chromeOptions)
                driver.set_page_load_timeout(120)
                driver.maximize_window()

        except KeyboardInterrupt:
            break
        except NoSuchWindowException:
            break
        except:
            driver.quit()
            time.sleep(10)
            driver = webdriver.Chrome(chrome_options=chromeOptions)
            driver.set_page_load_timeout(120)
            driver.maximize_window()
            continue

    wb.save(printout)
    log.close()
    driver.quit()
  • Could you be more succinct? Just share the exact lines of code that cause your problem, a short issue description, and the exception log (if you got any). Commented Feb 8, 2017 at 21:14
  • Target = 'allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22' My code pulls the data for that link, but just the first 10 records on page 1. The code that is stopping the programme from looping through is: """This is for navigation to next page""" next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') if next_b.get_attribute('class') == 'srch-Page-img': next_b.click() else: break Commented Feb 8, 2017 at 21:32

1 Answer


You're trying to handle the wrong element (driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') is not what you actually need). Try the code below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

try:
    time.sleep(1)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
except TimeoutException:
    break

This should allow you to keep moving to the next page for as long as the "Next" button (>) is available.
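For context, here is a rough sketch of how that snippet could slot into the while loop of your get_links function, replacing the original navigation block. This assumes the SRP_NextImg ID is present on every results page; driver and the link-collection code come from your original script:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

driver = webdriver.Chrome()
list_links = []
while True:
    # Collect the links on the current page, as in the original loop
    for item in driver.find_elements_by_xpath('//div[@class = "srch-Title3"]'):
        list_links.append(item.find_element_by_tag_name('a').get_attribute('href'))

    # Navigate to the next page; stop once no clickable "Next" button remains
    try:
        time.sleep(1)
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
    except TimeoutException:
        break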


Comments

Thanks for getting back to me. I incorporated your suggestion into the script and got the following error: "Element is not clickable". Could I share the script with you?
I got the same error. It may be a glitch in another area of the script. I am so close to making it work but just can't seem to get this last bit right.
Show the complete exception log
It doesn't seem to be allowing me to take a screenshot of my Anaconda prompt window. I don't have any other logs.
I mean the full log of the "Element is not clickable" error
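On the "Element is not clickable" error discussed above: a common cause is the button being outside the viewport or covered by another element when the click fires. A minimal sketch of a workaround, assuming the same SRP_NextImg ID and a live driver from the script above, is to scroll the button into view and fall back to a JavaScript click:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException

# Wait for the button to exist, then make sure it is actually on screen
next_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.ID, "SRP_NextImg")))
driver.execute_script("arguments[0].scrollIntoView(true);", next_btn)
try:
    next_btn.click()
except WebDriverException:
    # Fall back to a JavaScript click if another element still intercepts it
    driver.execute_script("arguments[0].click();", next_btn)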
