I scraped data from a local .html page, and the code below works. I am new to scraping and have only tried it on a simple HTML page. It takes 10 seconds to finish executing and print the data. If I have done anything wrong, or if anything could be improved, please let me know.
from bs4 import BeautifulSoup
import json
import pyodbc
import datetime
class JsonClass:
    """One table record rebuilt from its JSON representation.

    The attribute names intentionally match the JSON keys, so a decoded
    record can be expanded straight into the constructor with
    ``JsonClass(**record)``.

    Args:
        Date: The record's date (an ISO ``YYYY-MM-DD`` string when produced
            by ``html_data``).
        DocumentType: The document type text.
        Procedure: The procedure text.
        NoOfPages: The page count, as found in the source data.
    """

    def __init__(self, Date, DocumentType, Procedure, NoOfPages):
        self.Date = Date
        self.DocumentType = DocumentType
        self.Procedure = Procedure
        self.NoOfPages = NoOfPages

    def __repr__(self):
        # A readable repr makes printed/logged records debuggable instead of
        # showing only the default object address.
        return '{}(Date={!r}, DocumentType={!r}, Procedure={!r}, NoOfPages={!r})'.format(
            type(self).__name__,
            self.Date,
            self.DocumentType,
            self.Procedure,
            self.NoOfPages,
        )
def json_to_db(json_string):
    """Send a JSON document to SQL Server through ``prcJsonInsertData``.

    Args:
        json_string: JSON text bound to the stored procedure's ``@json``
            parameter.

    Returns:
        True when the stored procedure call completed; False when
        connecting or executing failed (a message is printed in that case).
    """
    conn = None
    try:
        # Connecting inside the ``try`` lets a failed connection be reported
        # through the same True/False contract as a failed insert.
        # The ODBC keyword for Windows authentication is
        # ``Trusted_Connection`` (the original ``Trusted_Connect`` is not a
        # recognised keyword).
        conn = pyodbc.connect(
            'Driver={SQL Server};'
            'Server=xyz-PAVILION;'
            'Database=jsondata;'
            'Trusted_Connection=yes'
        )
        conn.autocommit = True
        cursor = conn.cursor()
        # Parameter binding (``?``) keeps the JSON payload out of the SQL text.
        cursor.execute('EXEC prcJsonInsertData @json = ?', json_string)
        print('Data inserted')
    except pyodbc.Error as error:
        print('Error : %s' % error)
        return False
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C and
        # interpreter shutdown are not swallowed.
        print('Operation Failed')
        return False
    finally:
        # Close on every path; previously the connection leaked whenever an
        # error caused an early ``return False``.
        if conn is not None:
            conn.close()
    return True
def json_serialize(dict_list, path='html_to_json.json'):
    """Write records to a JSON file and return them as a JSON string.

    Args:
        dict_list: JSON-serializable data (a list of record dicts when
            called from ``html_data``).
        path: Destination file; defaults to ``html_to_json.json`` in the
            current working directory.

    Returns:
        The compact ``json.dumps`` encoding of ``dict_list``. The file
        receives the same data pretty-printed with a 4-space indent.
    """
    with open(path, 'w', encoding='utf-8') as file_out:
        json.dump(dict_list, file_out, indent=4)
    return json.dumps(dict_list)
def json_deserilaize(json_string, path='html_to_json.json'):
    """Read records back from the JSON file and print them.

    Args:
        json_string: Not used; the records are always read from ``path``.
            The parameter is kept so existing callers keep working.
        path: JSON file to read; defaults to ``html_to_json.json``.

    Each object in the file must have exactly the keys ``Date``,
    ``DocumentType``, ``Procedure`` and ``NoOfPages``, because it is
    expanded into ``JsonClass(**item)``.
    """
    with open(path, 'r', encoding='utf-8') as file_in:
        json_data = json.load(file_in)
    records = [JsonClass(**item) for item in json_data]
    print('********* After Deserialization *******************')
    print('-------------------------------------')
    for record in records:
        # ``format`` instead of ``+`` so non-string JSON values (for example
        # a numeric page count) print instead of raising TypeError.
        print('Date : {}'.format(record.Date))
        print('DocumentType : {}'.format(record.DocumentType))
        print('Procedure : {}'.format(record.Procedure))
        print('NoOfPages : {}'.format(record.NoOfPages))
        print('-------------------------------------')
def _parse_row(cells):
    """Build one record dict from the text of a table row's ``<td>`` cells."""
    parsed = datetime.datetime.strptime(cells[1], '%d.%m.%Y')
    return {
        'Date': parsed.date().isoformat(),
        'DocumentType': cells[2],
        # Non-breaking spaces (with or without a slash between them) are
        # collapsed to a plain '/'.
        'Procedure': cells[3].replace('\u00a0/\u00a0', '/').replace('\u00a0', '/'),
        'NoOfPages': cells[4],
    }


def html_data(html_path="C:/Users/xyz/Downloads/sample.htm"):
    """Scrape the table in an HTML file, then store and echo the records.

    The rows of the first ``<tbody>`` are converted to dicts, written to
    JSON with ``json_serialize``, sent to the database with ``json_to_db``
    and, if that succeeds, printed with ``json_deserilaize``.

    Args:
        html_path: HTML file to read; defaults to the original sample path.

    Raises:
        ValueError: If the document has no ``<tbody>``, or if a row's date
            cell does not match ``%d.%m.%Y``.
    """
    # Binary mode hands BeautifulSoup raw bytes, so ``from_encoding`` is
    # honoured; with a text-mode handle it is ignored and the locale's
    # default encoding decodes the file. ``with`` also closes the file.
    with open(html_path, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser', from_encoding="UTF-8")

    t_body = soup.find('tbody')
    if t_body is None:
        raise ValueError('No <tbody> element found in %s' % html_path)

    dict_list = []
    for row in t_body.find_all('tr'):
        cells = [cell.text for cell in row.find_all('td')]
        # Rows without the five expected cells (for example header rows made
        # of <th>) cannot be mapped to a record, so they are skipped.
        if len(cells) < 5:
            continue
        dict_list.append(_parse_row(cells))

    json_string = json_serialize(dict_list)
    if json_to_db(json_string):
        json_deserilaize(json_string)
# Script entry point: run the scrape/store/print pipeline only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    html_data()