I have to extract the whole table as a CSV file from the given URL: https://www.jma.go.jp/bosai/warning/#lang=en
I tried, but I cannot extract it with the same column names, because extracting the text ignores all the blank cells. I need exactly the same table in CSV format, using Beautiful Soup and Selenium.
from cgi import print_form
from cgitb import text
from lib2to3.pytree import type_repr
import time
from turtle import clear, title
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
# Page to scrape (English view selected through the URL fragment).
url = 'https://www.jma.go.jp/bosai/warning/#lang=en'

# Run Chrome without opening a visible browser window.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Raw string: the Windows path separator must not be parsed as an escape
# sequence ('\c' is an invalid escape and triggers a warning on recent
# Python versions). `options=` replaces the deprecated `chrome_options=`.
# NOTE(review): newer Selenium 4 releases no longer accept the driver path
# positionally; there, pass `service=Service(path)` instead -- confirm
# against the installed Selenium version.
driver = webdriver.Chrome(r'chromedriver_win32\chromedriver.exe', options=options)
try:
    driver.get(url)
    # Fixed pause before reading the DOM; presumably the table is rendered
    # client-side and needs time to appear -- TODO confirm 3 s is enough.
    time.sleep(3)
    page = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails.
    driver.quit()

soup = BeautifulSoup(page, 'html.parser')

# Tag names passed to first_column() further down.
A = ['th']
def first_column(A, soup):
    """Collect the text of the clickable warning cells into a DataFrame.

    For each tag name in ``A``, every element of that tag whose ``class``
    attribute matches one of the two "warning-clickable" class strings is
    looked up in ``soup``. The text of the first match becomes the column
    name and the texts of the remaining matches become the column values.

    Args:
        A: Iterable of HTML tag names to search for (e.g. ``['th']``).
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A ``pandas.DataFrame`` with one column per tag that produced at
        least one match, with duplicate rows removed (surviving rows keep
        their original index labels). Empty when nothing matched.
    """
    wanted_classes = [
        'contents-area contents-clickable warning-clickable',
        'contents-clickable warning-clickable',
    ]
    df = pd.DataFrame()
    for tag_name in A:
        texts = [
            cell.text
            for cell in soup.find_all(tag_name, attrs={'class': wanted_classes})
        ]
        if not texts:
            # No matching cells (e.g. the page had not rendered yet): skip
            # this tag instead of failing with IndexError on texts[0].
            continue
        header, *values = texts
        df[str(header)] = values
    return df.drop_duplicates()
def remaining_data(soup, *, header_index=3):
    """Collect the text of every table row below the header row.

    All ``<tr>`` elements of ``soup`` are read in document order. The text
    of the row at ``header_index`` becomes the single column name and the
    texts of all later rows become the column values; earlier rows are
    ignored. The function relies only on its arguments, not on
    module-level state.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find_all``.
        header_index: Zero-based position of the row used as column name.
            Defaults to 3 -- presumably the first real table row on the
            target page; verify against the live markup.

    Returns:
        A single-column ``pandas.DataFrame`` with duplicate rows removed
        (surviving rows keep their original index labels), or an empty
        DataFrame when the document has no row at ``header_index``.
    """
    row_texts = [row.text for row in soup.find_all('tr')]
    df = pd.DataFrame()
    if len(row_texts) <= header_index:
        # Too few rows to contain the header: return an empty frame rather
        # than failing with IndexError.
        return df
    df[str(row_texts[header_index])] = row_texts[header_index + 1:]
    return df.drop_duplicates()
df = first_column(A, soup)
df2 = remaining_data(soup)
if df.shape[1] == 0 or df2.shape[1] == 0:
    # Stop with a clear message instead of an obscure indexing error.
    raise SystemExit("No warning table found in the page source.")

# Full row text for each location, matched on index label (the same
# alignment a column assignment would perform). Both frames had duplicates
# dropped independently, so labels may have gaps; rows with no counterpart
# get an empty string and therefore match no warning level.
row_text = df2.iloc[:, 0].reindex(df.index).fillna("").astype(str)

# Later entries overwrite earlier ones, so "Emergency Warning" wins over
# "Warning", which is a substring of it.
warnings_list = ["Advisory", "Warning", "Emergency Warning"]
df["Final Warnings"] = "No Warnings"
for level in warnings_list:
    # Boolean-mask assignment through .loc writes into df itself and works
    # for any index, unlike chained df[...].loc[k] = ... by position.
    df.loc[row_text.str.contains(level, regex=False), "Final Warnings"] = level

df.columns = ["Locations", "Alert Type"]
df.to_csv("Japan_Warning_alerts.csv")
`df.columns = df.iloc[1, :]` — so that we can get the headers dynamically from here.