As a python novice I wish to download old newspaper archived on a website (http://digesto.asamblea.gob.ni/consultas/coleccion/) with my script below.
However, I fail to get my script to go through each row of the table and select "PDF" in the dropdown menu saving the corresponding link to a list (in order to download them).
My problem seems to be that the script cannot locate the PDF value from the each dropdown menu using the provided xpath.
This just be the part of the source code which does not function:
table_id = driver.find_element(By.ID, 'gridTableDocCollection')
rows = table_id.find_elements(By.TAG_NAME, "tr") # get all table rows
for row in rows:
elems = driver.find_elements_by_xpath('//ul[@class="dropdown-menu"]/a')
for elem in elems:
print(elem.get_attribute("href"))
Edit:
When I use this code:
list_of_links = driver.find_element_by_xpath('//ul[@class="dropdown-menu"]/li')
print(list_of_links)
I get selenium.webdriver.firefox.webelement.FirefoxWebElement (session="e6799ba5-5f0b-8b4f-817a-721326940b91", element="66c956f0-d813-a840-b24b-a12f92e1189b"instead of a link. What do I do wrong?
Can anyone please help me? I have read for hours through stackoverflow but where never able to get anything working (see part of the code which is commented out).
Disclaimer: when using the script you need to type the captcha by hand without pressing enter for the script to continue.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A small script to download issues of the Gaceta de Nicaragua (1843-1960) 19758 issues
import logging
from selenium.webdriver.remote.remote_connection import LOGGER
LOGGER.setLevel(logging.WARNING)
import os
import sys
import time
import shutil
from subprocess import call
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
profile = webdriver.FirefoxProfile() # profile to prevent download manager
profile.set_preference("network.cookie.cookieBehavior", 0) # accept all cookies
profile.set_preference("network.cookie.lifetimePolicy", 0) # accept cookies
profile.set_preference("network.cookie.alwaysAcceptSessionCookies", 1) # always allow sess
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.dir", 'Downloads/')
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", 'image/jpeg;application/jpeg;image/jpg;application/jpg')
url = 'http://digesto.asamblea.gob.ni/consultas/coleccion/' # web page
print('Opening digesto.asamblea.gob.ni...')
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url) # open url
driver.find_element_by_xpath('//*[@id="cavanzada"]').click() # advanced menu
driver.find_element_by_xpath("//select[@id='slcCollection']/option[text()='Diario Oficial']").click()
driver.find_element_by_xpath("//select[@id='slcMedio']/option[text()='Gaceta Oficial']").click() # change journal name here
inputElement = driver.find_element_by_xpath('//*[@id="txtDatePublishFrom"]')
inputElement.send_keys('01/01/1844') # change start date
inputElement = driver.find_element_by_xpath('//*[@id="txtDatePublishTo"]')
inputElement.send_keys('31/12/1860') # change end date
time.sleep( 5 ) # wait for Human Captcha Insertion
inputElement.send_keys(Keys.ENTER) # search
time.sleep( 2 ) # wait to load
select_element = Select(driver.find_element_by_xpath('//*[@id="slcResPage"]')) # page count
select_element.select_by_value('50') # max 50
time.sleep( 1 ) # wait to load
list_of_links = driver.find_elements_by_xpath('//ul[@class="dropdown-menu"]/a')
print(list_of_links)
#a=[];
#a = driver.find_elements_by_link_text("PDF");
#driver.find_element_by_link_text("PDF").click()
#a = driver.find_element_by_xpath("//select[@class='dropdown-menu']/option[text()='PDF']").click()
#a = driver.find_element_by_xpath('//*[contains(text(), '"dropdown-menu"')] | //*[@#='"PDF"']'); #[contains(@#, "PDF")]
#a = driver.find_elements_by_xpath("//*[contains(text(), 'PDF')]")
#a = driver.find_elements_by_xpath('//div[@class="dropdown-menu"][contains(@#, "PDF")]')
#print(a, sep='\n')
#print(*a, sep='\n')
#driver.find_element(By.CssSelector("a[title='Acciones']")).find_element(By.xpath(".//span[text()='PDF']")).click();
#select_element = Select(driver.find_element_by_xpath('//*[@id="gridTableDocCollection"]/html/body/div[3]/div[1]/div/div/form/div[3]/div[2]/table/tbody/tr[1]/td[5]/div/ul/li[1]/a'))
#select_element.select_by_text('PDF')
table_id = driver.find_element(By.ID, 'gridTableDocCollection')
rows = table_id.find_elements(By.TAG_NAME, "tr") # get all table rows
for row in rows:
elems = driver.find_elements_by_xpath('//ul[@class="dropdown-menu"]/a')
for elem in elems:
print(elem.get_attribute("href"))