I am trying to scrape all of the perfumes which are located at https://www.fragrantica.com/search/
There are almost 73,367 perfumes on the site and I want to load all of them. The problem is that the site shows 30 perfumes at a time, and you then need to click 'show more results' to load an additional 30 perfumes, and so on. So we must press the 'show more' button almost 2444 times to reach the end of the page and have all the perfumes loaded. This is my code so far:
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Build the Chrome options: reuse the default profile, hide the automation
# flag, start maximized and send a fixed desktop user-agent string.
options = Options()
for chrome_flag in (
    "--profile-directory=Default",
    '--disable-blink-features=AutomationControlled',
    "start-maximized",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Start Chrome through the locally downloaded chromedriver binary.
chromedriver_service = Service(
    executable_path='C:/Users/armon/Downloads/chromedriver_win32/chromedriver.exe'
)
driver = webdriver.Chrome(service=chromedriver_service, options=options)

# Open the home page and pause for a fixed three seconds before touching it.
url = 'https://www.fragrantica.com/'
driver.get(url)
time.sleep(3)

# Click the fifth entry of the off-canvas menu
# (presumably the "Perfumes" item, going by the variable name).
perfumes_btn = driver.find_element(
    by=By.XPATH,
    value='//*[@id="offCanvasLeft"]/ul/li[5]/a',
)
perfumes_btn.click()

# Wait up to 20 seconds for the first sub-menu link to become clickable, then
# follow it (presumably the search page, going by the variable name).
search_link_locator = (By.XPATH, '/html/body/div[2]/div[2]/div[2]/ul/li[5]/ul/li[1]/a')
search_btn = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable(search_link_locator)
)
search_btn.click()
# Repeatedly click the "load more" button until one of three things happens:
#   * the button does not become clickable again within the timeout,
#   * the result locator matches nothing, or
#   * at least `maxPerfumes` result elements are present on the page.
# Relies on `driver` created earlier in the script.
load_more_btn = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/div/div/button'
# Locator the script uses to count loaded results
# (assumed to match one element per perfume -- verify against the live page).
loaded_items_xpath = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]'
button_locator = (By.XPATH, load_more_btn)

maxPerfumes = 73346
button_wait = WebDriverWait(driver, 70)

# The first wait is deliberately left unguarded: if the button never shows up
# at all, the page did not load as expected and the error should surface.
loadingButton = button_wait.until(EC.element_to_be_clickable(button_locator))

i = 0
PerfumesLoaded = 0
while True:
    # A JavaScript click is used instead of a native click, as in the
    # original script.
    driver.execute_script("arguments[0].click();", loadingButton)
    i += 1
    print(i)

    try:
        loadingButton = button_wait.until(
            EC.element_to_be_clickable(button_locator)
        )
    except TimeoutException:
        # The button did not come back within 70 seconds. Stop clicking and
        # keep whatever has been loaded instead of crashing the script.
        PerfumesLoaded = len(driver.find_elements(by=By.XPATH, value=loaded_items_xpath))
        print(f"'Load more' button no longer clickable after {i} clicks; "
              f"{PerfumesLoaded} perfumes loaded.")
        break

    loadElems = driver.find_elements(by=By.XPATH, value=loaded_items_xpath)
    if not loadElems:
        print("Loaded all the perfumes")
        break

    PerfumesLoaded = len(loadElems)
    if PerfumesLoaded >= maxPerfumes:
        # f-string: the count is an int and cannot be concatenated to a str.
        print(f"{PerfumesLoaded} are loaded successfully.")
        break
The problem is that each time I run it, it makes a maximum of 34 loops, then stops and throws an error:
TimeoutException Traceback (most recent call last)
c:\Users\armon\OneDrive\Desktop\OLD\Data Analytics\Portfolio Projects\Jolse Project\Jolse Scraping Notebook.ipynb Cell 1' in <cell line: 36>()
41 loadElems = driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]')
42 if len(loadElems)>0:
---> 43 loadingButton = WebDriverWait(driver,70).until(EC.element_to_be_clickable((By.XPATH,load_more_btn)))
44 PerfumesLoaded = len(driver.find_elements(by = By.XPATH, value = '//*[@id="main-content"]/div[1]/div[1]/div/div/div/div[2]/div[1]/div/div[3]/div/div/div/span/div[1]'))
45 else:
File c:\Users\armon\anaconda3\envs\armoniaenv\lib\site-packages\selenium\webdriver\support\wait.py:87, in WebDriverWait.until(self, method, message)
85 if time.monotonic() > end_time:
86 break
---> 87 raise TimeoutException(message, screen, stacktrace)
TimeoutException: Message:
The print(i) that I have in the loop is simply there to show how many loops it has made. Any suggestions on how to tweak it, or am I doing something wrong?
My goal is to click 'load more' until I can't anymore, so that I can then access all the perfumes on the page. TIA