Web scraping using Python

Question

I have to extract the whole table to CSV from the given URL: https://www.jma.go.jp/bosai/warning/#lang=en

I tried, but I cannot extract it with the same column names, because when I extract the text it ignores all the blank cells. I need exactly the same table in CSV format, using Beautiful Soup and Selenium.

from cgi import print_form
from cgitb import text
from lib2to3.pytree import type_repr
import time
from turtle import clear, title
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

# Fetch the rendered warning page with headless Chrome and parse it.
url = 'https://www.jma.go.jp/bosai/warning/#lang=en'
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Raw string: '\c' in a normal literal is an invalid escape sequence (a
# SyntaxWarning on recent Pythons); the resulting path value is unchanged.
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): newer Selenium 4 releases no longer accept the driver path
# positionally -- confirm the installed Selenium version.
driver = webdriver.Chrome(r'chromedriver_win32\chromedriver.exe', options=options)
try:
    driver.get(url)
    # Fixed pause before reading the DOM -- presumably to let the page's
    # scripts populate the table; an explicit wait would be more reliable.
    time.sleep(3)
    page = driver.page_source
finally:
    # Always shut the browser down, even if loading the page failed.
    driver.quit()
soup = BeautifulSoup(page, 'html.parser')
# Tag names whose text first_column() collects.
A = ['th']
def first_column(A, soup):
    """Build a DataFrame from the text of the clickable warning cells.

    For every tag name in ``A`` the text of each matching element is
    collected; the first text becomes the column name and the remaining
    texts become the column values. Duplicate rows are dropped.

    Args:
        A: Iterable of tag names to search for (e.g. ``['th']``).
        soup: Parsed document exposing ``find_all(name, attrs=...)`` whose
            results have a ``.text`` attribute.

    Returns:
        A de-duplicated DataFrame; empty when no tag matched anything.
    """
    target_classes = [
        'contents-area contents-clickable warning-clickable',
        'contents-clickable warning-clickable',
    ]
    df = pd.DataFrame()
    for tag_name in A:
        texts = [
            tag.text
            for tag in soup.find_all(tag_name, attrs={'class': target_classes})
        ]
        if not texts:
            # Nothing matched (page not rendered yet, markup changed, ...):
            # skip instead of failing with IndexError on texts[0].
            continue
        df[str(texts[0])] = texts[1:]

    # The original index labels are kept (no reset), so gaps mark dropped rows.
    return df.drop_duplicates()

def remaining_data(soup):
    """Build a one-column DataFrame from the text of every ``<tr>`` row.

    The text of the fourth row (index 3) is used as the column name and the
    texts of all following rows as the values; duplicate rows are dropped.
    NOTE(review): the offset 3 presumably skips rows above the first table
    header on the target page -- confirm against the live markup.

    Args:
        soup: Parsed document exposing ``find_all(name)`` whose results have
            a ``.text`` attribute.

    Returns:
        A de-duplicated single-column DataFrame, or an empty DataFrame when
        fewer than four rows were found.
    """
    # The original looped over the module-level list ``A`` without using the
    # loop variable, repeating identical work once per entry and silently
    # depending on a global; a single pass gives the same result.
    row_texts = [row.text for row in soup.find_all('tr')]
    if len(row_texts) < 4:
        # No header row available: return an empty frame rather than
        # raising IndexError.
        return pd.DataFrame()

    df = pd.DataFrame()
    df[str(row_texts[3])] = row_texts[4:]
    # The original index labels are kept (no reset), so gaps mark dropped rows.
    return df.drop_duplicates()

# Combine the location column with the per-row text, classify each location
# by the most severe alert keyword found, and export the result.
df = first_column(A, soup)
df2 = remaining_data(soup)

# remaining_data() yields a single column of row text. Assigning it as a
# Series aligns on the index labels both frames kept from their scrape order
# (the original `df2.reset_index()` discarded its result, so alignment was
# already label-based). Labels missing from df2 become NaN.
row_text = df2.iloc[:, 0] if df2.shape[1] else pd.Series(dtype=object)
df["Warnings"] = row_text

# Ordered from least to most severe: later matches overwrite earlier ones,
# so "Emergency Warning" wins over the "Warning" substring it contains.
warnings_list = ["Advisory", "Warning", "Emergency Warning"]
df["Final Warnings"] = "No Warnings"
# Treat missing text as empty so the substring test never sees NaN.
searchable = df["Warnings"].fillna("").astype(str)
for level in warnings_list:
    # Boolean-mask assignment through df.loc replaces the chained
    # `df[...].loc[k] = ...` writes, which are unreliable and raised KeyError
    # on the gapped index left behind by drop_duplicates().
    df.loc[searchable.str.contains(level, regex=False), "Final Warnings"] = level

df = df.drop(columns="Warnings")
df.columns = ["Locations", "Alert Type"]
df.to_csv("Japan_Warning_alerts.csv")

Please provide enough code so others can better understand or reproduce the problem. — Community
– Community Bot, Commented Aug 30, 2022 at 20:05
Thank you, Kapil, for the code. The issue is that on this website the table's column names change: sometimes there are 5 columns and sometimes 6. They are not constant, so we can't hard-code them as headers. — gaurav12
– gaurav12, Commented Aug 31, 2022 at 12:17
Okay, so what you can do is extract the columns dynamically. The way I was extracting them, the headers are already picked up at the line `df.columns = df.iloc[1, :]`, so we can get the headers dynamically from there. — Kapil
– Kapil, Commented Aug 31, 2022 at 14:56

Kapil · Accepted Answer · 2022-08-30 16:30:24Z

Here you go. This also splits the file up by region.

import datetime
import io
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Column titles that follow the region name in each per-region header row;
# the split step below uses this text to recognise those header rows.
TABLE_HEADER = "Heavy Rain(Inundation),Heavy Rain(Landslide),Flood,StormGale,High Wave,Storm Surge,Thunder Storm,Dense Fog"

# NOTE(review): `executable_path` is deprecated in Selenium 4 and removed in
# later 4.x releases -- confirm the installed version.
driver = webdriver.Firefox(executable_path=os.path.join(os.getcwd(), "drivers", "geckodriver.exe"))
try:
    driver.get("https://www.jma.go.jp/bosai/warning/#lang=en")
    # Wait (up to 10 s) until the table container is visible before reading it.
    WebDriverWait(driver, 10).until((EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "div.contents-wide-table-scroll"))))
    tables = driver.find_element(by=By.CSS_SELECTOR, value="div.contents-wide-table-scroll")
    table_html = tables.get_attribute("innerHTML")
finally:
    # Close the browser even when the wait times out or the lookup fails.
    driver.quit()

# Wrap the markup in a file-like object: passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(io.StringIO(table_html))

df = df[0]
# The second parsed row carries the column titles; data starts after it.
df.columns = df.iloc[1, :]
df = df.iloc[2:, :]
# Make sure the output directory exists before writing into it.
os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
df.to_csv(os.path.join(os.getcwd(), "data", "data.csv"), index=False)


def save(current_table, name):
    """Write the lines of one table to ``<cwd>/data/<name>.csv``.

    Args:
        current_table: Iterable of text lines (header first, then rows).
            Lines may or may not already end with a newline.
        name: Base name of the output file, without the ``.csv`` suffix.
    """
    data_dir = os.path.join(os.getcwd(), "data")
    os.makedirs(data_dir, exist_ok=True)
    # utf-8 matches the encoding the rest of the script reads files with.
    with open(os.path.join(data_dir, f"{name}.csv"), "w", encoding="utf-8") as write_file:
        # writelines() adds no separators, so a header without a trailing
        # newline would be fused with the first data row; terminate every
        # line explicitly.
        write_file.writelines(
            line if line.endswith("\n") else line + "\n"
            for line in current_table
        )


# Split data.csv into one CSV per region. A line containing TABLE_HEADER is
# a region header whose first field is the region name; the lines after it,
# up to the next header, are that region's rows.
#
# Only data.csv is read: listing the whole directory would also pick up the
# per-region files this loop writes there on any later run.
name = None
current_table = []
with open(os.path.join(os.getcwd(), "data", "data.csv"), "r", encoding="utf-8") as read_file:
    for line in read_file:
        if TABLE_HEADER in line:
            # A new region starts: flush the rows collected so far under the
            # region they belong to (not under the name on this new header).
            if name is not None:
                save(current_table, name)
            name = line.split(",")[0].strip()
            # Keep the real header line so the header columns (including
            # the leading location column) line up with the data rows.
            current_table = [line]
        elif name is not None:
            current_table.append(line)

# The last region has no following header to trigger a save.
if name is not None:
    save(current_table, name)

Collectives™ on Stack Overflow

Web scraping using Python

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related