I'm trying to get data from a travel website using Selenium. I can extract the data to a CSV file, but I am not able to insert the data into my MySQL database.
import csv
import json
import re
import sys
import time
import unittest
import urllib

import requests
from bs4 import BeautifulSoup

import mysql.connector

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
user_agent = {'User-agent': 'Chrome/43.0.2357.124'}
output_file = open("Excel.csv", "w", newline='')
class Crawling(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.set_window_size(1024, 768)
        self.base_url = "https://www.ctrip.com/"
        self.accept_next_alert = True

    def test_sel(self):
        driver = self.driver
        delay = 3
        driver.get(self.base_url + "Search/new york")
        # scroll down so the result list is rendered before scraping
        for i in range(1, 2):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        html_source = driver.page_source
        data = html_source.encode("utf-8")
        elements = driver.find_elements_by_xpath("/html/body/div[4]/div[1]/div[2]/div/div[5]/div/div[1]/div[1]/ul/li[1]/div/div[1]")
        innerElements = 15
        outerElements = len(elements) / innerElements
        # print(innerElements, "\t", outerElements, "\t", len(elements))
        for j in range(1, 20):
            # price, headline and deeplink of the j-th result
            price = driver.find_element_by_xpath("/html/body/div[4]/div[1]/div[2]/div/div[5]/div/div[1]/div[1]/ul/li[" + str(j) + "]/div/div[1]/div[2]/span[1]").text
            headline = driver.find_element_by_xpath("/html/body/div[4]/div[1]/div[2]/div/div[5]/div/div[1]/div[1]/ul/li[" + str(j) + "]/div/div[1]/div[2]/strong").text
            deeplink = driver.find_element_by_xpath("/html/body/div[4]/div[1]/div[2]/div/div[5]/div/div[1]/div[1]/ul/li[" + str(j) + "]/div/div[1]/div[3]/div/ul/li[1]/a").get_attribute("href")
            if not all([headline, price]):
                print("Header not available" + " | " + "Price not available" + " | " + "Deeplink: " + str(deeplink))
                headline = "Not available as well as price"
            else:
                print("Header: " + headline + " | " + "Price: " + price[4:] + " | " + "Deeplink: " + str(deeplink))
            writer = csv.writer(output_file)
            csv_fields = ['Header', 'Price', 'Deeplink', 'PartnerID', 'LocationID']
            if elements:
                writer.writerow([headline, price[4:], deeplink, partner_ID, location_ID])


if __name__ == "__main__":
    unittest.main()
This is the additional code that should let me insert the data into the database (the first part is the same loop body as above, followed by the MySQL part):
            if not all([headline, price]):
                print("Header not available" + " | " + "Price not available" + " | " + "Deeplink: " + str(deeplink))
                headline = "Not available as well as price"
            else:
                print("Header: " + headline + " | " + "Price: " + price[4:] + " | " + "Deeplink: " + str(deeplink))
            writer = csv.writer(output_file)
            csv_fields = ['Header', 'Price', 'Deeplink', 'PartnerID', 'LocationID']
            if elements:
                writer.writerow([headline, price[4:], deeplink, partner_ID, location_ID])

        # after the scraping loop: connect to MySQL and write the results
        try:
            connection = mysql.connector.connect(
                host="localhost", user="root", passwd="", db="crawling")
        except:
            print("No connection")
            sys.exit(0)

        cursor = connection.cursor()
        cursor.execute("TRUNCATE meta;")
        connection.commit()
        cursor.execute("ALTER TABLE meta AUTO_INCREMENT = 1;")
        connection.commit()
        cursor.execute('''INSERT INTO meta (price_id, Header, Price, Deeplink)
                          VALUES ("%s", "%s", "%s", "%s")''')
        connection.commit()
        cursor.close()
        connection.close()


if __name__ == "__main__":
    unittest.main()
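In case the table definition matters: I created the meta table roughly like this (reconstructed from memory, so the exact column types may differ):

# Rough reconstruction of my table; price_id is the auto-increment key,
# the other columns just hold the scraped strings.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS meta (
        price_id INT AUTO_INCREMENT PRIMARY KEY,
        Header   VARCHAR(255),
        Price    VARCHAR(64),
        Deeplink TEXT
    )
""")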
But the problem is that nothing gets inserted into the database. Can you help me out or give me a hint? Any feedback is appreciated.
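From reading the mysql.connector docs I suspect the scraped values have to be passed to execute() as a parameter tuple (with unquoted %s placeholders), and that the INSERT probably needs to run once per row inside the scraping loop rather than once at the end. This is only a guess on my side, reusing the headline, price and deeplink variables from the loop:

            # My guess: one parameterized INSERT per scraped row,
            # letting price_id auto-increment instead of inserting it.
            cursor.execute(
                "INSERT INTO meta (Header, Price, Deeplink) VALUES (%s, %s, %s)",
                (headline, price[4:], deeplink)
            )
            connection.commit()

Is something along these lines what I am missing, or is the real problem somewhere else?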