
I am doing some web scraping with BeautifulSoup and Selenium, and I want to write my data to an Excel file.

# coding: utf-8

import os
import time
from datetime import datetime

import requests
import bs4
from selenium import webdriver

initialpage = 'https://www.boxofficemojo.com/yearly/chart/?yr=2017&p=.htm'
res = requests.get(initialpage, timeout=None)
soup = bs4.BeautifulSoup(res.text, 'html.parser')



pages = []
pagelinks = soup.select('a[href^="/yearly/chart/?page"]')

# each chart-page link appears twice on the page, hence the halving
for link in pagelinks[:len(pagelinks) // 2]:
    pages.append("https://www.boxofficemojo.com" + link.get('href'))



pages.insert(0, initialpage)
date_dic = {}
movie_links = []
titles = []
Domestic_Gross_Arr = []
Genre_Arr = []
Release_Date_Arr = []
Theaters_Arr = []
Budget_Arr = []
Views_Arr = []
Edits_Arr = []
Editors_Arr = []


for i in range(len(pages)):
    movie_count = 0
    res1 = requests.get(pages[i])
    souppage = bs4.BeautifulSoup(res1.text, 'html.parser')
    for j in souppage.select('tr > td > b > font > a'):
        link = "".join(j.get("href")[7:].split("&"))
        final = "https://www.boxofficemojo.com/movies" + link
        if "/?id" in final:
            movie_links.append(final)
            movie_count += 1

    # the theater counts live in the rows following the grey header row
    number_of_theaters = souppage.find("tr", bgcolor="#dcdcdc")
    for k in range(movie_count):
        Theaters_Arr.append(number_of_theaters.next_sibling.contents[4].text)
        number_of_theaters = number_of_theaters.next_sibling



k = 0
path = os.path.join(os.getcwd(), 'movie_pictures')
if not os.path.isdir(path):  # don't crash if the folder already exists
    os.makedirs(path)
os.chdir(path)
while k < 2:  # only the first two movies for now
    j = movie_links[k]
    try:
        res1 = requests.get(j)
        soup1 = bs4.BeautifulSoup(res1.text, 'html.parser')

        c = soup1.select('td[width="35%"]')

        genre = soup1.select('td[valign="top"]')[5].select('b')
        image = soup1.select('img')[6].get('src')
        budget = soup1.select('tr > td > b')
        domestic = str(c[0].select('b'))[4:-5]  # strip the surrounding [<b> ... </b>]


        release = soup1.nobr.a
        title = soup1.select('title')[0].getText()[:-25]
        print ("-----------------------------------------")
        print ("Title: " +title)
        titles.append(title)
        print ("Domestic Gross: " +domestic)
        Domestic_Gross_Arr.append(domestic)
        print ("Genre: "+genre[0].getText())
        Genre_Arr.append(genre[0].getText())
        print ("Release Date: " +release.contents[0])
        Release_Date_Arr.append(release.contents[0])
        print ("Production Budget: " +budget[5].getText())
        Budget_Arr.append(budget[5].getText())

        # parse a release date like "June 9, 2017" into a datetime object
        year1 = str(release.contents[0])[-4:]
        a, b = str(release.contents[0]).split(",")
        month1, day1 = a.split(" ")
        datez = year1 + month1 + day1
        new_date = datetime.strptime(datez, "%Y%B%d")
        date_dic[title] = new_date


        with open('pic' + str(k) + '.png', 'wb') as handle:
            response = requests.get(image, stream=True)

            if not response.ok:
                print(response)

            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)  # must sit inside the loop, or only the last chunk is written
    except Exception:
        print("Error occurred, page or data not available")
    k += 1



def subtract_one_month(t):
    # step back one day at a time until we land in the previous month,
    # at or before the starting day-of-month
    import datetime
    one_day = datetime.timedelta(days=1)
    one_month_earlier = t - one_day
    while one_month_earlier.month == t.month or one_month_earlier.day > t.day:
        one_month_earlier -= one_day
    return one_month_earlier.strftime('%Y-%m-%d')
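# e.g. subtract_one_month(datetime(2017, 3, 31)) returns '2017-02-28'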

number_of_errors=0
browser = webdriver.Chrome("/Users/Gokce/Downloads/chromedriver")
browser.maximize_window() 
browser.implicitly_wait(20)
for i in titles:
    try:
        release_date = date_dic[i]
        i = i.replace(' ', '_')
        i = i.replace("2017", "2017_film")
    #end = datetime.strptime(release_date, '%B %d, %Y')


        end_date = release_date.strftime('%Y-%m-%d')
        start_date = subtract_one_month(release_date)
        url = "https://tools.wmflabs.org/pageviews/?project=en.wikipedia.org&platform=all-access&agent=user&start="+ start_date +"&end="+ end_date + "&pages=" + i


        browser.get(url)
        page_views_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--pageviews .linear-legend--counts:first-child span.pull-right")
        page_edits_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--revisions .linear-legend--counts:first-child span.pull-right")
        page_editors_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--revisions .legend-block--body .linear-legend--counts:nth-child(2) span.pull-right")
        print(i)

        print("Number of Page Views: " + page_views_count.text)
        Views_Arr.append(page_views_count.text)
        print("Number of Edits: " + page_edits_count.text)
        Edits_Arr.append(page_edits_count.text)
        print("Number of Editors: " + page_editors_count.text)
        Editors_Arr.append(page_editors_count.text)
    except Exception:
        print("Error occurred for this page: " + str(i))
        number_of_errors += 1
        Views_Arr.append(-1)
        Edits_Arr.append(-1)
        Editors_Arr.append(-1)


time.sleep(5)
browser.quit()



import xlsxwriter
os.chdir("/home")
workbook = xlsxwriter.Workbook('WebScraping.xlsx')
worksheet = workbook.add_worksheet()

worksheet.write(0,0, "Hello")
worksheet.write(0,1, 'Genre')
worksheet.write(0,2, 'Production Budget')
worksheet.write(0,3, 'Domestic Gross')
worksheet.write(0,4, 'Release Date')
worksheet.write(0,5, 'Number of Wikipedia Page Views')
worksheet.write(0,6, 'Number of Wikipedia Edits')
worksheet.write(0,7, 'Number of Wikipedia Editors')

row = 1
for i in range(len(titles)):
    worksheet.write(row, 0, titles[i])
    worksheet.write(row, 1, Genre_Arr[i])
    worksheet.write(row, 2, Budget_Arr[i])
    worksheet.write(row, 3, Domestic_Gross_Arr[i])
    worksheet.write(row, 4, Release_Date_Arr[i])
    worksheet.write(row, 5, Theaters_Arr[i])
    worksheet.write(row, 6, Views_Arr[i])
    worksheet.write(row, 7, Edits_Arr[i])
    worksheet.write(row, 8, Editors_Arr[i])
    row += 1


workbook.close()

The code works until the xlsxwriter part at the end, where I get this error:

---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-9-c99eea52d475> in <module>()
     27 
     28 
---> 29 workbook.close()

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in close(self)
    309         if not self.fileclosed:
    310             self.fileclosed = 1
--> 311             self._store_workbook()
    312 
    313     def set_size(self, width, height):

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in _store_workbook(self)
    638 
    639         xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
--> 640                             allowZip64=self.allow_zip64)
    641 
    642         # Add XML sub-files to the Zip file with their Excel filename.

/Users/Gokce/anaconda2/lib/python2.7/zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
    754             modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
    755             try:
--> 756                 self.fp = open(file, modeDict[mode])
    757             except IOError:
    758                 if mode == 'a':

IOError: [Errno 45] Operation not supported: 'WebScraping.xlsx' 

What might be the problem? If I cut off the last part and run it in a new IDLE with fake data, it works, but it does not work in the main IDLE. So I believe the problem must be in the earlier part.
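For reference, the script changes the working directory twice before this point (into movie_pictures, then to /home), so a quick check of where the file would actually be written is:

import os
print(os.getcwd())  # run this just before workbook.close()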

  • Please create a minimal reproducible example. Whatever web scraping you're doing has nothing to do with problems writing an Excel file, so it should be removed and replaced with fixed data. Commented Aug 29, 2018 at 19:13
  • Make sure the file is closed in Excel before you run the script. Commented Aug 29, 2018 at 19:50

2 Answers


The error is triggered when xlsxwriter tries to create the file. Your script calls os.chdir("/home") right before building the workbook, so 'WebScraping.xlsx' is being written there. Confirm that you have write permission to that directory and that the file doesn't already exist. On a Mac (your traceback shows /Users/Gokce/...), /home is a special automounted directory, which is why the write fails with "Operation not supported"; it's unlikely that you can write there.
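As a minimal sketch (the output path here is just an example), save the workbook to a directory you know you can write to, such as your own home folder:

import os
import xlsxwriter

# write to an absolute path in a directory you own, instead of relying
# on whatever the current working directory happens to be
out_path = os.path.expanduser('~/WebScraping.xlsx')  # example location
workbook = xlsxwriter.Workbook(out_path)
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'Title')  # dummy cell, just to prove the write works
workbook.close()
print("wrote " + out_path)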


I used openpyxl a few years ago to automate some Excel spreadsheets. You can use code like this:

import openpyxl
# you could create a template spreadsheet that already contains the
# header row written by the worksheet.write() calls in your script
wb = openpyxl.load_workbook('Template.xlsx')
ws = wb.create_sheet('my_sheet')  # create_sheet() returns the new worksheet
row = 1
for i in range(len(titles)):
    ws.cell(row=row, column=1, value=titles[i])  # openpyxl is 1-indexed
    ws.cell(row=row, column=2, value=Genre_Arr[i])
    ws.cell(row=row, column=3, value=Budget_Arr[i])
    ws.cell(row=row, column=4, value=Domestic_Gross_Arr[i])
    ws.cell(row=row, column=5, value=Release_Date_Arr[i])
    ws.cell(row=row, column=6, value=Theaters_Arr[i])
    ws.cell(row=row, column=7, value=Views_Arr[i])
    ws.cell(row=row, column=8, value=Edits_Arr[i])
    ws.cell(row=row, column=9, value=Editors_Arr[i])
    row += 1
wb.save('my_file.xlsx')
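Note that openpyxl rows and columns are 1-based, which is why the cells above start at column=1. If you don't need a template file at all, a fresh workbook plus ws.append() per row is even simpler; a small sketch with made-up data:

import openpyxl

wb = openpyxl.Workbook()  # a new, empty workbook; no template required
ws = wb.active
ws.append(['Title', 'Genre', 'Production Budget'])     # header row
ws.append(['Example Movie', 'Comedy', '$10,000,000'])  # hypothetical data row
wb.save('my_file.xlsx')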
