I have web-scraped the data and printed it, but now I want to export it to Excel/CSV. I am new to Python and need help: there are multiple pages that I have scraped, and now I need to export them to CSV/Excel. My code is below.

import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs

def scrap_bid_data():

    page_no = 1  # initial page number
    while True:
        print('Hold on, creating URL to fetch data...')
        URL = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)  # create the dynamic URL
        print('URL created: ' + URL)

        scraped_data = requests.get(URL, verify=False)  # request the page
        soup_data = bs(scraped_data.text, 'lxml')  # parse the response using lxml
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})  # find the div that contains the required data

        if len(extracted_data) == 0:  # if the page has no data, stop further execution of the script
            break
        else:
            for idx in range(len(extracted_data)):  # loop through the div's children and print the data
                if idx % 2 == 1:  # the required data sits on odd indexes only
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
                    print('-' * 100)
                    print(bid_data[0])   # BID number
                    print(bid_data[5])   # Items
                    print(bid_data[6])   # Quantity required
                    print(bid_data[10] + bid_data[12].strip())  # Department name and address
                    print(bid_data[16])  # Start date
                    print(bid_data[17])  # End date
                    print('-' * 100)

            page_no += 1  # increment the page number by 1

scrap_bid_data()

1 Answer

Since you already have the data elements, you can write them to a CSV in a couple of steps:

  • Create a list of lists, with each inner list being a single row of data elements
  • Save the full list to CSV by passing it to csv.writer.writerows

Here are the code updates:

def scrap_bid_data():

    csvlst = [['BID number','Items','Quantity Required','Department name and address','Start date','End date']]  # header row  # ADD THIS LINE
    page_no = 1  # initial page number
    while True:
        ...................

        if len(extracted_data) == 0:  # if the page has no data, stop further execution of the script
            break
        else:
            for idx in range(len(extracted_data)):  # loop through the div's children and extract the data
                if idx % 2 == 1:  # the required data sits on odd indexes only
                    bid_data = extracted_data.contents[idx].text.strip().split('\n')
                    .................
                    csvlst.append([bid_data[0], bid_data[5], bid_data[6], bid_data[10] + bid_data[12].strip(), bid_data[16], bid_data[17]])  # CSV row  # ADD THIS LINE

            page_no += 1  # increment the page number by 1

    import csv  # write the CSV once the loop has finished  # ADD THIS SECTION
    with open("out.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(csvlst)

scrap_bid_data()
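
If you specifically want an Excel file rather than a CSV, the same list of lists can be handed to pandas. This is a minimal sketch, not part of the original answer; it assumes pandas and openpyxl are installed (pip install pandas openpyxl):

import pandas as pd

# csvlst[0] is the header row and the rest are data rows, as built in the function above
df = pd.DataFrame(csvlst[1:], columns=csvlst[0])
df.to_excel("out.xlsx", index=False)  # .xlsx output; pandas uses the openpyxl engine for this
df.to_csv("out.csv", index=False)     # the CSV equivalent, for comparison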

2 Comments

It's taking too much time: 800 pages in 90 minutes!
It's probably requests.get that's taking most of the time. Look into multiprocessing/multithreading to help with this.
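
One way to act on that suggestion, as a minimal sketch: fetch all the pages concurrently with concurrent.futures.ThreadPoolExecutor from the standard library, then parse them in the main thread. The total page count (800, taken from the comment above) and the worker count of 16 are illustrative assumptions:

import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup as bs

BASE = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no='

def fetch_page(page_no):
    return requests.get(BASE + str(page_no), verify=False).text  # one HTTP request per page

with ThreadPoolExecutor(max_workers=16) as pool:  # 16 workers is a guess; tune for your connection
    pages = list(pool.map(fetch_page, range(1, 801)))  # pages 1..800, per the comment

rows = []
for html in pages:
    extracted_data = bs(html, 'lxml').find('div', {'id': 'pagi_content'})
    if extracted_data is None:  # page missing or layout changed; skip it
        continue
    for idx in range(1, len(extracted_data), 2):  # odd indexes only, as in the question
        bid_data = extracted_data.contents[idx].text.strip().split('\n')
        rows.append([bid_data[0], bid_data[5], bid_data[6],
                     bid_data[10] + bid_data[12].strip(),
                     bid_data[16], bid_data[17]])

The rows list can then be written out with csv.writer.writerows exactly as in the answer.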
