selenium multiple window scrape. Python

Question

I have been trying many things for the last few hours on this. Strangely, when Selenium opens the new window, it switches to it, but instead of scraping data from the new page, it keeps closing the previous windows and scraping data from them. Also, it keeps opening all the links instead of just the next one. I have sort of hit a wall with this one. Any help is appreciated. Thanks.

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


# Start Chrome, load the ScienceDirect search results for the user's term and
# collect the first <a> element of every result item.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
# One ActionChains instance is created here and shared by every get_data() call.
actions = ActionChains(driver)
search_term = input("Enter your search term :")
# NOTE(review): search_term is interpolated as typed, without URL-encoding.
url = f'https://www.sciencedirect.com/search?qs={search_term}&years=2021%2C2020%2C2019&lastSelectedFacet=years'
driver.get(url)
driver.maximize_window()

# Wait up to 10 s for the button at this absolute XPath to become clickable, then
# click it (presumably an overlay/consent dismiss button -- TODO confirm).
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'/html/body/div[3]/div/div/div/button/span'))).click()
divs = driver.find_elements_by_class_name('result-item-content')
# `links` holds WebElement objects located on the results page, not URL strings.
links = []
for div in divs:
    link = div.find_element_by_tag_name('a')
    links.append(link)


def get_data():
    """Ctrl+click the current `link`, scrape author data from another window, close it.

    Takes no arguments: it reads the module-level names `driver`, `actions` and
    `link` (`link` is whatever the calling ``for link in links`` loop last bound).
    For each ``a.author`` inside ``#author-group`` it prints either the text of the
    ``e-address`` element (after clicking the author's envelope icon) or the
    author's name when no envelope icon is found. Returns None.
    """
    # NOTE(review): `actions` is one shared ActionChains. If the installed Selenium
    # version does not clear queued actions in perform(), every call replays all
    # earlier Ctrl+clicks as well -- this would match the reported "keeps opening
    # all the links" symptom. Confirm against the Selenium version in use.
    actions.key_down(Keys.CONTROL).click(link).key_up(Keys.CONTROL).perform()
    par_guid = driver.current_window_handle
    allguid = driver.window_handles
    # Switch to the FIRST handle that is not the current window. When more than one
    # extra window is open this is not necessarily the one just opened.
    for guid in allguid:
        if guid != par_guid:
            driver.switch_to.window(guid)
            break
    # Raises NoSuchElementException (uncaught) if the page has no #author-group.
    author_group = driver.find_element_by_id('author-group')
    for author in author_group.find_elements_by_css_selector("a.author"):
        try:
            given_name = author.find_element_by_css_selector(".given-name").text
            surname = author.find_element_by_css_selector(".surname").text

        except NoSuchElementException:
            print("Could not extract first or last name")
            continue

        try:
            mail_icon = author.find_element_by_css_selector(".icon-envelope")
            mail_icon.click()
            mail_icon_present = True
            # Looked up on the whole page (driver), not inside this author element.
            mail = driver.find_element_by_class_name('e-address')
            print(mail.text)

        except NoSuchElementException:
            # Reached when either the envelope icon or the e-address element is missing.
            mail_icon_present = False
            print(f"Author {given_name} {surname}. Mail icon present: {mail_icon_present}")
    # Close the window that was switched to and return focus to the original one.
    driver.close()
    driver.switch_to.window(par_guid)

# get_data() takes no parameters; it picks up the loop variable `link` as a global.
for link in links:
    get_data()

Hello! hmmm those links that are being opened to a new tab... you do some scraping there right? — Ice Bear
– Ice Bear, Commented Dec 19, 2020 at 7:00
Yes..but instead of scraping the new ones and closing them..it is opening all the links again. Scraping the old pages as well. — Abhishek Rai
– Abhishek Rai, Commented Dec 19, 2020 at 7:01
Hmm... instead of opening the links in a new tab, how about using the Python requests or requests-HTML libraries to fetch the pages and then parsing them with a separate parser? requests-HTML requests — Ice Bear
– Ice Bear, Commented Dec 19, 2020 at 7:06
Could you instead of clicking on the link try to driver.get(link) so no new tab opens? — Abrogans
– Abrogans, Commented Dec 19, 2020 at 8:46

Abhishek Rai · Accepted Answer · 2020-12-19 09:03:29Z

The problematic part was this line: `actions.key_down(Keys.CONTROL).click(link).key_up(Keys.CONTROL).perform()`. I think this is not the right way to open a new tab if you are dealing with multiple links. I changed it to `driver.execute_script('window.open(arguments[0]);', link)`. I also let go of saving the links to a list. This works because there are only two windows open at a time: the home window and the link we clicked on. So, in its entirety, the code becomes:

from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


# Scrape article titles, author names and e-mail addresses from ScienceDirect
# search results. Each result is opened in its own tab, scraped, and closed again,
# so at most two windows (results page + one article) are open at any time.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
search_term = input("Enter your search term :")
# URL-encode the term so spaces, '&', '#', etc. cannot corrupt the query string.
url = (
    'https://www.sciencedirect.com/search'
    f'?qs={quote_plus(search_term)}&years=2021%2C2020%2C2019&lastSelectedFacet=years'
)
driver.get(url)
driver.maximize_window()

# Wait for the button at this absolute XPath to become clickable and click it
# before reading the result list.
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '/html/body/div[3]/div/div/div/button/span'))
).click()

# Focus always returns to the results page, so its handle is captured only once.
parent_window = driver.current_window_handle
for div in driver.find_elements(By.CLASS_NAME, 'result-item-content'):
    # Pass the URL string itself rather than relying on the anchor element being
    # stringified to its href inside the browser.
    link = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
    driver.execute_script('window.open(arguments[0]);', link)
    # Make sure the new window handle exists before looking it up.
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    child_window = next(
        window for window in driver.window_handles if window != parent_window
    )
    driver.switch_to.window(child_window)
    try:
        try:
            title = driver.find_element(By.TAG_NAME, 'h1')
            author_group = driver.find_element(By.ID, 'author-group')
        except NoSuchElementException:
            # Not every result page has the expected layout; skip it instead of
            # aborting the whole run.
            print(f"Skipping {link}: title or author group not found")
            continue
        print("Article Title:- ", title.text)

        for author in author_group.find_elements(By.CSS_SELECTOR, "a.author"):
            try:
                given_name = author.find_element(By.CSS_SELECTOR, ".given-name").text
                surname = author.find_element(By.CSS_SELECTOR, ".surname").text
            except NoSuchElementException:
                print("Could not extract first or last name")
                continue

            try:
                author.find_element(By.CSS_SELECTOR, ".icon-envelope").click()
                # The address element is looked up on the whole page, not inside
                # the author element.
                mail = driver.find_element(By.CLASS_NAME, 'e-address')
                print(mail.text)
            except NoSuchElementException:
                # No envelope icon / address for this author: print the name only.
                print(f"Author {given_name} {surname}")
    finally:
        # Always close the article tab and return to the results page, even when
        # scraping failed, so the two-window invariant holds for the next result.
        driver.close()
        driver.switch_to.window(parent_window)

Collectives™ on Stack Overflow

selenium multiple window scrape. Python

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related