This script scrapes upcoming sports games from a website (www.oddsportal.com) into a list of dictionaries, one per game. It takes under 2.5 minutes to run.
import datetime
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Tournament pages whose upcoming fixtures get scraped.
upcoming = [
    'http://www.oddsportal.com/basketball/usa/wnba/',
]
# Shared accumulator: rescrape() appends one dict per game row it finds.
nextgames = list()
def _normalize_date(date_text):
    """Convert a date header scraped from the games table into a date string.

    Headers containing "oday", "omorrow" or "esterday" are replaced by the
    matching calendar date formatted as ``"%d %b %Y"``, computed from this
    machine's local clock. A header of the form ``"<date> - <event>"`` is cut
    down to its date part. Anything else is returned unchanged.
    """
    today = datetime.date.today()
    # Keywords omit their first letter -- presumably so the match does not
    # depend on how the site capitalises them. Order matters: first hit wins.
    relative_days = (("oday", 0), ("omorrow", 1), ("esterday", -1))
    for keyword, offset in relative_days:
        if keyword in date_text:
            day = today + datetime.timedelta(days=offset)
            return day.strftime("%d %b %Y")
    if " - " in date_text:
        return date_text.split(" - ", 1)[0]
    return date_text


def rescrape(urls, cs):
    """Scrape upcoming games from each page in *urls* and save them as CSV.

    Every URL is loaded twice in one PhantomJS session. Each game row found
    is appended to the module-level ``nextgames`` list, so the list -- and
    therefore the CSV written here -- also contains rows collected by any
    earlier call.

    Args:
        urls: iterable of tournament page URLs to scrape.
        cs: path of the CSV file to write.

    Returns:
        pandas.DataFrame built from everything currently in ``nextgames``.
    """
    driver = webdriver.PhantomJS(executable_path=r'C:/phantomjs.exe')
    try:
        # Visiting this link asks the site to switch the displayed timezone.
        # NOTE(review): the setting reportedly does not stick on every page
        # load; the cause is not visible from this code -- TODO investigate
        # (e.g. verify the timezone shown on the page before scraping it).
        driver.get('http://www.oddsportal.com/set-timezone/15/')
        driver.implicitly_wait(3)
        wait = WebDriverWait(driver, 5)
        for url in urls:
            # Each page is scraped twice, in exactly the same way both times.
            for _ in range(2):
                driver.get(url)
                # Make sure the table with games has appeared; if it has
                # not, give up on this attempt.
                try:
                    wait.until(EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, "table#tournamentTable tr.odd")))
                except TimeoutException:
                    continue
                table = driver.find_element(
                    By.CSS_SELECTOR, "table#tournamentTable")
                for match in table.find_elements(By.TAG_NAME, 'tr')[3:]:
                    # Rows without a "home - away" participant cell (such as
                    # date headers) are not games and are skipped.
                    try:
                        participants = match.find_element(
                            By.CLASS_NAME, "table-participant").text
                        home, away = participants.split(" - ")
                    except (NoSuchElementException, ValueError):
                        continue
                    # The date lives in the closest preceding header cell.
                    date_header = match.find_element(
                        By.XPATH,
                        ".//preceding::th[contains(@class, 'first2')][1]").text
                    kickoff = match.find_element(
                        By.CLASS_NAME, "table-time").text
                    nextgames.append({
                        "current time": time.ctime(),
                        "home": home.strip(),
                        "away": away.strip(),
                        "date": _normalize_date(date_header),
                        "time": kickoff.strip()})
                time.sleep(3)
                print(len(nextgames))
            print(len(nextgames))
    finally:
        # quit() also stops the PhantomJS process, and runs even when the
        # scrape raises, so no browser is left behind.
        driver.quit()
    df = pd.DataFrame(nextgames)
    df.to_csv(cs, encoding='utf-8')
    return df
# Run three scrape rounds, each saved to its own file: trial0.csv .. trial2.csv.
for i in range(3):
    csv_name = 'trial' + str(i) + '.csv'
    rescrape(upcoming, csv_name)
The problem is that setting the timezone with driver.get('http://www.oddsportal.com/set-timezone/15/') doesn't always work: about 20% of the time, the scrape reverts to the site's default timezone of GMT. The output below shows the wrong date and time in the third round, after the first two rounds came out right. Notice how the last range(2) loop gets both times wrong but only the second date wrong, which means the timezone can change in either iteration of the loop:
pd.set_option('display.max_colwidth', 10)
Unnamed: 0 away current time date home time
0 0 Washin... Wed Ju... 8-Jun-16 Dallas... 20:30
1 1 San An... Wed Ju... 9-Jun-16 Phoeni... 22:00
2 2 Washin... Wed Ju... 8-Jun-16 Dallas... 20:30
3 3 San An... Wed Ju... 9-Jun-16 Phoeni... 22:00
4 4 Washin... Wed Ju... 8-Jun-16 Dallas... 20:30
5 5 San An... Wed Ju... 9-Jun-16 Phoeni... 22:00
6 6 Washin... Wed Ju... 8-Jun-16 Dallas... 20:30
7 7 San An... Wed Ju... 9-Jun-16 Phoeni... 22:00
8 8 Phoeni... Wed Ju... 8-Jun-16 Minnes... 0:00
9 9 New Yo... Wed Ju... 8-Jun-16 Los An... 2:00
10 10 Washin... Wed Ju... 9-Jun-16 Dallas... 0:30
11 11 San An... Wed Ju... 10-Jun-16 Phoeni... 2:00
So how do I ensure that the timezone-setting driver.get call takes effect every time? I currently use an implicit wait and have also tried explicit waits, to no avail.