
I am learning Python and decided to do a web-scraping project using BeautifulSoup and Selenium.

Site: https://careers.amgen.com/ListJobs?

Goal: retrieve all the variables related to a job ad. Variables identified: ID, job title, URL, city, state, zip, country, and the date the job was posted.

Problem: I managed to extract the data from the first page of the table, but I cannot extract the data from the other pages, even though I am using the option to go to the next page.

Any help would be much appreciated.

Please find my code below.

import re
import os
import selenium
import pandas as pd

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from bs4 import BeautifulSoup


# driver = webdriver.Chrome(ChromeDriverManager().install())
browser = webdriver.Chrome("")  # path to chromedriver needed here; check your own path
browser.get('https://careers.amgen.com/ListJobs?')
browser.implicitly_wait(100)
soup = BeautifulSoup(browser.page_source, 'html.parser')
code_soup = soup.find_all('tr', attrs={'role': 'row'})

# creating data set
df = pd.DataFrame({'id': [],
                   'jobs': [],
                   'url': [],
                   'city': [],
                   'state': [],
                   'zip': [],
                   'country': [],
                   'added': []})
d = code_soup

next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')



for i in range(2, 12):  # catch error, out of bounds?
    df = df.append({'id' : d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                     "jobs" : d[i].find_all("td", {"class":"JobTitle-cell"}),
                     "url" : d[i].find("a").attrs['href'],
                     "city" : d[i].find_all("td", {"class": "City-cell"}),
                     "state" : d[i].find_all("td", {"class": "State-cell"}),
                     "zip" : d[i].find_all("td", {"class": "Zip-cell"}),
                     "country" : d[i].find_all("td", {"class": "Country-cell"}),
                     "added" : d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)
    
df['url'] = 'https://careers.amgen.com/' + df['url'].astype(str)
df["company"] = "Amgen"
df

#iterate through the pages

next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')
for p in range(1,7): #go from page 1 to 6
    next_page.click()
    browser.implicitly_wait(20)
    print(p)

I tried multiple things; this is my latest attempt, and it did not work:

```
p = 0
next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')

for p in range(1,7):   
    for i in range(2,12):
        df1 = df.append({'id' : d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                         "jobs" : d[i].find_all("td", {"class":"JobTitle-cell"}),
                         "url" : d[i].find("a").attrs['href'],
                         "city" : d[i].find_all("td", {"class": "City-cell"}),
                         "state" : d[i].find_all("td", {"class": "State-cell"}),
                         "zip" : d[i].find_all("td", {"class": "Zip-cell"}),
                         "country" : d[i].find_all("td", {"class": "Country-cell"}),
                         "added" : d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)
        p += 1
        next_page.click()
    print(p)

```

2 Answers

import requests
import re
import pandas as pd


params = {
    'sort': 'AddedOn-desc',
    'page': '1',
    'pageSize': '1000',
    'group': '',
    'filter': '',
    'fields': 'JobTitle,DisplayJobId,City,State,Zip,Country,AddedOn,UrlJobTitle'

}

headers = {
    "Origin": 'https://careers.amgen.com'
}


def main(url):
    # the careers page embeds the jobs API endpoint in its HTML; extract it
    r = requests.get(url)
    api = re.search(r'JobsApiUrl="(.*?)"', r.text).group(1)
    # query the API directly; pageSize=1000 returns every job in a single response
    r = requests.get(api, params=params, headers=headers).json()
    df = pd.DataFrame(r['Data'])
    print(df)
    df.to_csv("data.csv", index=False)


main("https://careers.amgen.com/ListJobs")

Output: the full job listing as a DataFrame, also written to data.csv.


Changing a single line of your code will do the job. Instead of the XPath you are currently using to select the 'next' arrow, use the following one:

next_page = browser.find_element_by_xpath('//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]')
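
For reference, here is a minimal, untested sketch of what the full paging loop could look like with that locator. It assumes the `browser` session from the question is already on the listing page, that the grid re-renders in place after each click (so the arrow has to be re-located on every pass and the page needs a moment before re-parsing), and it factors the per-row extraction from the question into a hypothetical `parse_rows` helper:

```
import time

import pandas as pd
from bs4 import BeautifulSoup

NEXT_XPATH = '//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]'
COLUMNS = {'id': 'DisplayJobId-cell', 'jobs': 'JobTitle-cell', 'city': 'City-cell',
           'state': 'State-cell', 'zip': 'Zip-cell', 'country': 'Country-cell',
           'added': 'AddedOn-cell'}


def parse_rows(page_source):
    """Hypothetical helper: collect one record per job row on the current page."""
    soup = BeautifulSoup(page_source, 'html.parser')
    records = []
    for row in soup.find_all('tr', attrs={'role': 'row'}):
        link = row.find('a')
        if link is None:  # header/pager rows have no job link; skip them
            continue
        record = {}
        for name, css in COLUMNS.items():
            cell = row.find('td', {'class': css})
            record[name] = cell.get_text(strip=True) if cell else None
        record['url'] = 'https://careers.amgen.com/' + link['href']
        records.append(record)
    return records


# assumes `browser` (from the question) is already showing page 1 of the table
all_records = parse_rows(browser.page_source)
for _ in range(6):                                     # pages 2-7; adjust to the real page count
    browser.find_element_by_xpath(NEXT_XPATH).click()  # re-locate the arrow on every pass
    time.sleep(2)                                      # crude wait for the grid to re-render
    all_records.extend(parse_rows(browser.page_source))

df = pd.DataFrame(all_records)
df['company'] = 'Amgen'
```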
