I'm trying to scrape all 5000 companies from this page. It's a dynamic page, and the companies are loaded as I scroll down. However, I can only scrape 5 companies, so how can I scrape all 5000? The URL changes as I scroll down the page. I tried Selenium, but it is not working. https://www.inc.com/profile/onetrust Note: I want to scrape all of the companies' info, but for now I have selected just two fields.
import time
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Scrape rank and company name from the profile page.
#
# The page is described as dynamically loaded, so the HTML that matters is the
# DOM rendered by the browser. The parse below therefore uses Selenium's
# `page_source` instead of downloading the URL a second time with urllib,
# which would only return the initial, un-rendered markup.
my_url = 'https://www.inc.com/profile/onetrust'
options = Options()
# `options=` is the supported keyword; `chrome_options=` is the deprecated
# spelling and is rejected by current Selenium releases.
driver = webdriver.Chrome(options=options)
try:
    driver.get(my_url)
    # Give the page's scripts a moment to render before taking the snapshot.
    time.sleep(3)
    page = driver.page_source
finally:
    # Always close the browser, even if loading the page failed.
    driver.quit()

# Kept under its original name; it now holds the rendered DOM.
page_html = page
page_soup = soup(page_html, "html.parser")
containers = page_soup.find_all("div", class_="sc-prOVx cTseUq company-profile")

# Iterating directly handles "no match" cleanly (the former `containers[0]`
# lookup raised IndexError on an empty result and was otherwise unused).
for container in containers:
    rank_tag = container.h2
    company_name_1 = container.find_all("h2", class_="sc-AxgMl LXebc h2")
    if rank_tag is None or not company_name_1:
        # Skip profiles whose markup lacks the expected headings.
        continue
    rank = rank_tag.get_text()
    Company_name = company_name_1[0].get_text()
    print("rank :" + rank)
    print("Company_name :" + Company_name)
Updated code, but the page is not scrolling at all. I corrected some mistakes in the BeautifulSoup code.
import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
# URL of the company profile page to load.
my_url = 'https://www.inc.com/profile/onetrust'
# Start a Chrome session with default settings; `driver` is a module-level
# name that the code further down reads `page_source` from.
driver = webdriver.Chrome()
# Navigate to the page. The browser is left open here so later code can
# interact with the loaded page.
driver.get(my_url)
def scroll_down(self, pause=2, max_scrolls=None):
    """Scroll the page to the bottom repeatedly until its height stops growing.

    Args:
        self: Either a WebDriver-like object (anything with
            ``execute_script``) or an object exposing one as ``.driver``.
            Accepting both lets this be called as a plain function
            (``scroll_down(driver)``) as well as in the original
            method style.
        pause: Seconds to wait after each scroll before re-measuring the
            page height.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height is unchanged.

    Returns:
        None.
    """
    # Use `.driver` when present, otherwise treat the argument as the driver.
    browser = getattr(self, "driver", self)

    # Get scroll height.
    last_height = browser.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        # Scroll down to the bottom.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        # Wait to load the page.
        time.sleep(pause)
        # An unchanged height means the last scroll added nothing new.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
# Scroll the page, snapshot the rendered DOM, then extract rank and name.
#
# scroll_down() was previously defined but never invoked, so the browser never
# scrolled. It reads the browser from a `.driver` attribute, so the driver is
# handed over wrapped in a small namespace object.
from types import SimpleNamespace

try:
    scroll_down(SimpleNamespace(driver=driver))
    page_soup = soup(driver.page_source, "html.parser")
finally:
    # Close the browser once the snapshot is taken (or if scrolling failed).
    driver.quit()

containers = page_soup.find_all("div", class_="sc-prOVx cTseUq company-profile")

# Iterating directly handles "no match" cleanly (the former `containers[0]`
# lookup raised IndexError on an empty result and was otherwise unused).
for container in containers:
    rank_tag = container.h2
    company_name_1 = container.find_all("h2", class_="sc-AxgMl LXebc h2")
    if rank_tag is None or not company_name_1:
        # Skip profiles whose markup lacks the expected headings.
        continue
    rank = rank_tag.get_text()
    Company_name = company_name_1[0].get_text()
    print("rank :" + rank)
    print("Company_name :" + Company_name)
Thank you for reading!