0

I have to extract the whole table as CSV from the given URL: https://www.jma.go.jp/bosai/warning/#lang=en

I tried, but I cannot reproduce the table with the same column names: when I extract the text, all the blank cells are ignored. I need exactly the same table in CSV format, using Beautiful Soup and Selenium.

from cgi import print_form
from cgitb import text
from lib2to3.pytree import type_repr
import time
from turtle import clear, title
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

url = 'https://www.jma.go.jp/bosai/warning/#lang=en'

# Run Chrome without opening a visible window.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Raw string: '\c' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on recent Python versions.
# `options=` is the supported keyword; `chrome_options=` is deprecated.
driver = webdriver.Chrome(r'chromedriver_win32\chromedriver.exe', options=options)
try:
    driver.get(url)
    # Fixed pause before reading the page source -- presumably to let the page
    # finish rendering; TODO confirm that 3 seconds is always enough.
    time.sleep(3)
    page = driver.page_source
finally:
    # Always shut the browser down, even if loading the page failed.
    driver.quit()

soup = BeautifulSoup(page, 'html.parser')

# Tag names scanned by the extraction functions defined below.
A = ['th']
def first_column(A, soup):
    """Build a DataFrame from the text of the clickable warning cells.

    For every tag name in ``A``, the elements of ``soup`` whose ``class``
    attribute equals one of the two "warning-clickable" variants are
    collected. The text of the first match becomes the column name and the
    text of the remaining matches becomes that column's values.

    Args:
        A: Iterable of tag names to search for.
        soup: Parsed document exposing ``find_all(name, attrs=...)``.

    Returns:
        A DataFrame with one column per tag name that produced at least one
        match, with duplicate rows removed (surviving rows keep their
        original labels). Tag names without any match are skipped, so the
        frame is empty when nothing matches at all.
    """
    wanted_classes = [
        'contents-area contents-clickable warning-clickable',
        'contents-clickable warning-clickable',
    ]
    df = pd.DataFrame()
    for tag_name in A:
        texts = [
            cell.text
            for cell in soup.find_all(tag_name, attrs={'class': wanted_classes})
        ]
        if not texts:
            # Nothing matched for this tag: skip it rather than failing with
            # an IndexError while looking for the header cell.
            continue
        header, *values = texts
        df[str(header)] = values
    return df.drop_duplicates()

def remaining_data(soup):
    """Build a single-column DataFrame from the text of the ``<tr>`` rows.

    The text of the fourth ``<tr>`` (index 3) is used as the column name and
    the text of every later row becomes a value; the first three rows are
    ignored.

    Args:
        soup: Parsed document exposing ``find_all(name)``.

    Returns:
        A one-column DataFrame with duplicate rows removed (surviving rows
        keep their original labels), or an empty DataFrame when the document
        has fewer than four ``<tr>`` rows.
    """
    # No dependency on the module-level tag list: the rows are always looked
    # up by the literal 'tr' tag, so a single pass is all that is needed.
    row_texts = [row.text for row in soup.find_all('tr')]
    df = pd.DataFrame()
    if len(row_texts) > 3:
        df[str(row_texts[3])] = row_texts[4:]
    return df.drop_duplicates()

df = first_column(A, soup)
df2 = remaining_data(soup)

# Both frames keep their original row labels after drop_duplicates(), and the
# assignment below lines the rows up by those labels. Taking the first (only)
# column as a Series makes that alignment explicit.
df["Warnings"] = df2.iloc[:, 0]

# Order matters: a later entry overwrites an earlier one, which also resolves
# "Warning" being a substring of "Emergency Warning".
warnings_list = ["Advisory", "Warning", "Emergency Warning"]
df["Final Warnings"] = "No Warnings"

# Rows without an aligned text cell hold NaN; treat them as empty text so they
# simply keep the "No Warnings" default instead of raising a TypeError.
warning_text = df["Warnings"].fillna("").astype(str)
for level in warnings_list:
    has_level = warning_text.str.contains(level, regex=False)
    # Single .loc call with a boolean mask: works with the gapped row labels
    # and avoids chained assignment, which may write to a temporary copy.
    df.loc[has_level, "Final Warnings"] = level

df = df.drop(columns="Warnings")
df.columns = ["Locations", "Alert Type"]
df.to_csv("Japan_Warning_alerts.csv")
4
  • please include the code snippet of what you have tried Commented Aug 30, 2022 at 14:46
  • Please provide enough code so others can better understand or reproduce the problem. Commented Aug 30, 2022 at 20:05
  • Thank you, Kapil, for the code. The issue is that the table's column names change on this website: sometimes there are 5 columns and sometimes 6. They are not constant, so we can't hard-code them as headers. Commented Aug 31, 2022 at 12:17
  • Okay, so what you can do is extract the columns dynamically. The way I was extracting them, the headers are picked up at the line df.columns = df.iloc[1, :], so we can get the headers dynamically from there. Commented Aug 31, 2022 at 14:56

1 Answer 1

1

Here you go; this also splits the file up by region.

import datetime
import io
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Text the splitting loop further down searches for to recognise a header line;
# it is also written as the first line of every per-name CSV file.
TABLE_HEADER = "Heavy Rain(Inundation),Heavy Rain(Landslide),Flood,StormGale,High Wave,Storm Surge,Thunder Storm,Dense Fog"

# NOTE(review): `executable_path=` is deprecated in Selenium 4 in favour of a
# Service object -- confirm against the installed Selenium version.
driver = webdriver.Firefox(
    executable_path=os.path.join(os.getcwd(), "drivers", "geckodriver.exe"))
try:
    driver.get("https://www.jma.go.jp/bosai/warning/#lang=en")
    # Wait (up to 10 s) until the table container is visible before reading it.
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "div.contents-wide-table-scroll")))
    tables = driver.find_element(by=By.CSS_SELECTOR, value="div.contents-wide-table-scroll")
    table_html = tables.get_attribute("innerHTML")
finally:
    # Close the browser even when the wait times out or the lookup fails.
    driver.quit()

# read_html expects a path, URL or file-like object; passing the markup as a
# bare string is deprecated, so wrap it in a StringIO buffer.
df = pd.read_html(io.StringIO(table_html))

df = df[0]
# The second row of the parsed table supplies the column names; everything
# after it is kept as data.
df.columns = df.iloc[1, :]
df = df.iloc[2:, :]

# Make sure the output directory exists before writing into it.
os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
df.to_csv(os.path.join(os.getcwd(), "data", "data.csv"), index=False)


def save(current_table, name):
    """Write the collected lines to ``data/<name>.csv`` under the working directory.

    Args:
        current_table: Iterable of strings written verbatim. No line
            terminators are added, so every entry must carry its own newline.
        name: File name to use, without the ``.csv`` extension.
    """
    target_dir = os.path.join(os.getcwd(), "data")
    # Create the directory on demand so a fresh checkout does not fail here.
    os.makedirs(target_dir, exist_ok=True)
    # Explicit UTF-8 keeps the output independent of the platform default and
    # consistent with the encoding used when the source CSV is read back.
    with open(os.path.join(target_dir, f"{name}.csv"), "w", encoding="utf-8") as write_file:
        write_file.writelines(current_table)


# Split every CSV in the data directory into one file per header line found.
# A line containing TABLE_HEADER starts a new section, and the first
# comma-separated field of that line names the output file.
# NOTE(review): files written by save() land in this same directory, so a
# second run will also scan the earlier output -- confirm that is intended.
data_dir = os.path.join(os.getcwd(), "data")
for file in os.listdir(data_dir):
    # Name of the section currently being collected; None until a header line
    # has been seen, so rows before the first header are not written anywhere.
    name = None
    # The header needs its own newline: writelines() adds no terminators.
    # NOTE(review): TABLE_HEADER has no entry for the leading name field, so
    # data rows may carry one more field than this header -- confirm.
    current_table = [TABLE_HEADER + "\n"]
    with open(os.path.join(data_dir, file), "r", encoding="utf-8") as read_file:
        for line in read_file:
            if TABLE_HEADER in line:
                # Flush the rows gathered so far under the name of the section
                # they belong to, not the name on the header just reached.
                if name is not None:
                    save(current_table, name)
                name = line.split(",")[0]
                current_table = [TABLE_HEADER + "\n"]
            else:
                current_table.append(line)
    # The last section has no following header to trigger a save.
    if name is not None:
        save(current_table, name)

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.