
I'm working on web scraping with BeautifulSoup to retrieve jobs from Indeed. My code works, but when it loops to the next page it overwrites the existing CSV file. I gather from other posts that I need pandas concat, but I can't get it to work or figure out where it belongs in my source code. Any suggestions to improve my code would also be greatly appreciated.

The code below scrapes pages 1-2 on Indeed.

from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urljoin


print('Getting new jobs...')

main_url = 'https://www.indeed.com/jobs?q=web+developer&l=Sacramento,+CA&sort=date'
start_from = '&start='  


for page in range(1, 3):
    start = (page - 1) * 10  # Indeed paginates results in steps of 10
    url = "%s%s%d" % (main_url, start_from, start)  # build the full page URL
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

    home = 'https://www.indeed.com/viewjob?'
    jobsTitle, companiesName, citiesName, jobsSummary, jobsLink = [], [], [], [], []
    target = soup.find_all('div', class_=' row result')  # passing the full class string (leading space included) matches the attribute exactly

    for div in target:
        title = div.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title)

        company = div.find('span', class_='company').text.strip()
        companiesName.append(company)

        city = div.find('span', class_='location').text.strip()
        citiesName.append(city)

        summary = div.find('span', class_='summary').text.strip()
        jobsSummary.append(summary)

        job_link = urljoin(home, div.find('a').get('href'))
        jobsLink.append(job_link)


    # the last job card on the page uses a different class, so collect it separately
    target2 = soup.find_all('div', class_='lastRow row result')
    for i in target2:
        title2 = i.find('a', class_='turnstileLink').text.strip()
        jobsTitle.append(title2)

        company2 = i.find('span', class_='company').text.strip()
        companiesName.append(company2)

        city2 = i.find('span', class_='location').text.strip()
        citiesName.append(city2)

        summary2 = i.find('span', class_='summary').text.strip()
        jobsSummary.append(summary2)

        jobLink2 = urljoin(home, i.find('a').get('href'))
        jobsLink.append(jobLink2)


    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
print(df)  # a bare df expression only displays in a notebook

1 Answer

You can create the list data_record outside the loop and pass it to the DataFrame constructor once at the end:

data_record = []
for page in range(1, 3):
    start = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, start)  # build the full page URL
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

...

    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
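Alternatively, if you keep building a per-page DataFrame inside the loop (as in the original question's code), you can append to the CSV on each pass instead of rewriting it. This is a minimal sketch, assuming an output file named jobs.csv; the original code never actually writes a CSV, so the file name and the append-mode call are illustrative:

import os

csv_path = 'jobs.csv'  # hypothetical file name, not part of the original code
# Inside the page loop, after building df for the current page:
df.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path), index=False)

Writing the header only when the file does not exist yet avoids repeating the header row for every page.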

A possible solution with concat: build one DataFrame per page, collect them in a list, and concatenate once at the end:

dfs = []
for page in range(1, 3):
    start = (page - 1) * 10
    url = "%s%s%d" % (main_url, start_from, start)  # build the full page URL
    indeed = requests.get(url)
    indeed.raise_for_status()
    soup = BeautifulSoup(indeed.text, 'html.parser')

...

    data_record = []
    for title, company, city, summary, link in zip(jobsTitle, companiesName, citiesName, jobsSummary, jobsLink):
        data_record.append({'Job Title': title, 'Company': company, 'City': city, 'Summary': summary, 'Job Link': link})

    df = pd.DataFrame(data_record, columns=['Job Title', 'Company', 'City', 'Summary', 'Job Link'])
    dfs.append(df)

df_fin = pd.concat(dfs, ignore_index=True)
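The combined frame can then be written out once at the end, so nothing is overwritten between pages (the file name is again an assumption, since the original code never names one):

df_fin.to_csv('jobs.csv', index=False)  # a single write covering all pages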