Below is the code, when I run the script, it starts over from the 1st page while running. The aim is to get the post title, date, and body from each page and then click next at the bottom of each page to start the process again.
Here are the includes: import requests import csv import urllib.parse as urlparse from urllib.parse import parse_qs from bs4 import BeautifulSoup from selenium import webdriver import time
browser = webdriver.Chrome('/Users/Xander/desktop/scraper/chromedriver')
URL = "https://www.jancox.com/jans-daily-news"
browser.get(URL)
URL_PAG = None
PAG = None
# Function Definition
def scrapeP(r):
count = 0
soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
quotes = []
table = soup.find('div', attrs = {'class':'main-content'})
for post in table.findAll('div', attrs = {'class':'post'}):
quote = {}
quote['title'] = post.h1.text
print(quote['date_published'])
doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
time.sleep(2)
doc.click()
URL_PAG = browser.current_url
count += 1
PAG = True
time.sleep(10)
print(count, ' - ', URL_PAG)
if(count % 10 == 0):
filename = 'post.csv'
with open(filename, 'a+', newline='') as f:
w = csv.DictWriter(f,['title', 'post', 'date_published'])
w.writeheader()
for quote in quotes:
w.writerow(quote)
quote.clear()
while True:
if(PAG == True):
browser.get(URL_PAG)
r = requests.get(URL_PAG)
print(URL_PAG)
scrapeP(r)
else:
browser.get(URL)
r = requests.get(URL)
scrapeP(r)