
I am learning Python and decided to do a web-scraping project using BeautifulSoup and Selenium.

Site: https://careers.amgen.com/ListJobs?

Goal: retrieve all the variables related to a job ad. Variables identified: ID, job title, URL, city, state, zip, country, and the date the job was posted.

Problem: I managed to extract the data from the first page of the table, but I cannot extract the data from the other pages, even though I am using the option to go to the next page.

Any help would be much appreciated.

Please find my code below.

import re
import os
import selenium
import pandas as pd

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from bs4 import BeautifulSoup


# driver = webdriver.Chrome(ChromeDriverManager().install())
browser = webdriver.Chrome("")  # path to chromedriver needed here; check your own path
browser.get('https://careers.amgen.com/ListJobs?')
browser.implicitly_wait(100)
soup = BeautifulSoup(browser.page_source, 'html.parser')
code_soup = soup.find_all('tr', attrs={'role': 'row'})

# creating data set
df = pd.DataFrame({'id': [],
                   'jobs': [],
                   'url': [],
                   'city': [],
                   'state': [],
                   'zip': [],
                   'country': [],
                   'added': []})
d = code_soup

next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')



for i in range(2, 12):  # catch error, out of bounds?
    df = df.append({'id' : d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                     "jobs" : d[i].find_all("td", {"class":"JobTitle-cell"}),
                     "url" : d[i].find("a").attrs['href'],
                     "city" : d[i].find_all("td", {"class": "City-cell"}),
                     "state" : d[i].find_all("td", {"class": "State-cell"}),
                     "zip" : d[i].find_all("td", {"class": "Zip-cell"}),
                     "country" : d[i].find_all("td", {"class": "Country-cell"}),
                     "added" : d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)
    
df['url'] = 'https://careers.amgen.com/' + df['url'].astype(str)
df["company"] = "Amgen"
df

#iterate through the pages

next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')
for p in range(1,7): #go from page 1 to 6
    next_page.click()
    browser.implicitly_wait(20)
    print(p)

I tried multiple things; this is my latest attempt, and it did not work:

```
p = 0
next_page = browser.find_element_by_xpath('//*[@id="jobGrid0"]/div[2]/a[3]/span')

for p in range(1,7):   
    for i in range(2,12):
        df1 = df.append({'id' : d[i].find_all("td", {"class": "DisplayJobId-cell"}),
                         "jobs" : d[i].find_all("td", {"class":"JobTitle-cell"}),
                         "url" : d[i].find("a").attrs['href'],
                         "city" : d[i].find_all("td", {"class": "City-cell"}),
                         "state" : d[i].find_all("td", {"class": "State-cell"}),
                         "zip" : d[i].find_all("td", {"class": "Zip-cell"}),
                         "country" : d[i].find_all("td", {"class": "Country-cell"}),
                         "added" : d[i].find_all("td", {"class": "AddedOn-cell"})}, ignore_index=True)
        p += 1
        next_page.click()
    print(p)

```

2 Answers

import requests
import re
import pandas as pd


params = {
    'sort': 'AddedOn-desc',
    'page': '1',
    'pageSize': '1000',
    'group': '',
    'filter': '',
    'fields': 'JobTitle,DisplayJobId,City,State,Zip,Country,AddedOn,UrlJobTitle'

}

headers = {
    "Origin": 'https://careers.amgen.com'
}


def main(url):
    # the careers page embeds the jobs API endpoint in its HTML; extract it
    r = requests.get(url)
    api = re.search(r'JobsApiUrl="(.*?)"', r.text).group(1)
    # query the API directly; pageSize=1000 returns every job in a single response
    r = requests.get(api, params=params, headers=headers).json()
    df = pd.DataFrame(r['Data'])
    print(df)
    df.to_csv("data.csv", index=False)


main("https://careers.amgen.com/ListJobs")

Output: the full job listing as a DataFrame, also written to data.csv.


Changing a single line of your code will do the job. Instead of the XPath you are currently using to select the 'next' arrow, use the following one:

next_page = browser.find_element_by_xpath('//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]')
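
For reference, here is a minimal, untested sketch of what the full paging loop could look like with that locator. It assumes the `browser` session from the question is already on the listing page, that the grid re-renders in place after each click (so the arrow has to be re-located on every pass and the page needs a moment before re-parsing), and it factors the per-row extraction from the question into a hypothetical `parse_rows` helper:

```
import time

import pandas as pd
from bs4 import BeautifulSoup

NEXT_XPATH = '//a[@class="k-link k-pager-nav"]//following::a[@class="k-link k-pager-nav"]'
COLUMNS = {'id': 'DisplayJobId-cell', 'jobs': 'JobTitle-cell', 'city': 'City-cell',
           'state': 'State-cell', 'zip': 'Zip-cell', 'country': 'Country-cell',
           'added': 'AddedOn-cell'}


def parse_rows(page_source):
    """Hypothetical helper: collect one record per job row on the current page."""
    soup = BeautifulSoup(page_source, 'html.parser')
    records = []
    for row in soup.find_all('tr', attrs={'role': 'row'}):
        link = row.find('a')
        if link is None:  # header/pager rows have no job link; skip them
            continue
        record = {}
        for name, css in COLUMNS.items():
            cell = row.find('td', {'class': css})
            record[name] = cell.get_text(strip=True) if cell else None
        record['url'] = 'https://careers.amgen.com/' + link['href']
        records.append(record)
    return records


# assumes `browser` (from the question) is already showing page 1 of the table
all_records = parse_rows(browser.page_source)
for _ in range(6):                                     # pages 2-7; adjust to the real page count
    browser.find_element_by_xpath(NEXT_XPATH).click()  # re-locate the arrow on every pass
    time.sleep(2)                                      # crude wait for the grid to re-render
    all_records.extend(parse_rows(browser.page_source))

df = pd.DataFrame(all_records)
df['company'] = 'Amgen'
```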
