I'm working on web scraping with beautiful soup to retrieve jobs from indeed. My code is working but when it loops to the next page it would overwrite the existing CSV file. I see from other posts that I would need to use pandas concat? but I can't seem to get it to work or where to implement it in my source code. Any suggestions to improve my code would also be greatly appreciated.
Below scrape pages 1-2 on indeed.
from bs4 import BeautifulSoup
import requests, pandas as pd
from urllib.parse import urljoin
print('Getting new jobs...')
main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='
for page in range(1, 3):
page = (page - 1) * 10
url = "%s%s%d" % (main_url, start_from, page) # get full url
indeed = requests.get(url)
indeed.raise_for_status()
soup = BeautifulSoup(indeed.text, 'html.parser')
home = 'https://www.indeed.com/viewjob?'
jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []
target = soup.find_all('div', class_=' row result')
for div in target:
if div:
title = div.find('a', class_='turnstileLink').text.strip()
jobsTitle.append(title)
company = div.find('span', class_='company').text.strip()
companiesName.append(company)
city = div.find('span', class_='location').text.strip()
citiesName.append(city)
summary = div.find('span', class_='summary').text.strip()
jobsSummary.append(summary)
job_link = urljoin(home, div.find('a').get('href'))
jobsLink.append(job_link)
target2 = soup.find_all('div', class_='lastRow row result')
for i in target2:
title2 = i.find('a', class_='turnstileLink').text.strip()
jobsTitle.append(title2)
company2 = i.find('span', class_='company').text.strip()
companiesName.append(company2)
city2 = i.find('span', class_='location').text.strip()
citiesName.append(city2)
summary2 = i.find('span', class_='summary').text.strip()
jobsSummary.append(summary2)
jobLink2 = urljoin(home, i.find('a').get('href'))
jobsLink.append(jobLink2)
data_record = []
for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})
df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
df