Writing data to. csv pandas

Question

I need to write data to a csv file, I am currently engaged in parsing an online store, there is a different number of characteristics on each product, for example: weight, length, etc. I am trying to write data using pandas, but I can't write all the data correctly into the dictionary, tell me how to do it correctly

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import time
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

URLS = ['https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/','https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/']
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

content = []
colum = []
driver = webdriver.Chrome(ChromeDriverManager().install())

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r


def get_specifications_parameter():
    num = 0
    WebDriverWait(driver, 5).until(expected_conditions.visibility_of_element_located((By.XPATH, '//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button')))

    driver.find_element_by_xpath('//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button').click()
    time.sleep(3)
    table = driver.find_element_by_xpath('//*[@id="range-modal-mount-node"]/div/div[4]/div/div[2]/div/div/div').get_attribute('innerHTML')
    soup = BeautifulSoup(table,'html.parser')
    
    param = soup.find_all('dd',class_='range-revamp-product-dimensions__list-item-measure')
    titles = soup.find_all('dt',class_='range-revamp-product-dimensions__list-item-name')
    for item in titles:
        if item.text in colum:
            pass
        else:
            colum.append(item.text)
    for item in param:
        content.append({titles[num].text:item.text}) #Writing characteristics to content
        num+=1
    print(content)

    
def get_content(url):
    driver.get(url)
    get_specifications_parameter()

    # content.append[{
    #     'name':name,
    #     'price':price,
    #     'photo':photo,
    #     'description':description
    #     }]
    #  Additional data to be recorded 
    print(content)

def start():
    for URL in URLS:
        html = get_html(URL)
        if html.status_code == 200:
            get_content(URL)
        else:
            print('Network error')

def write():
    df = pd.DataFrame(colum)
    for p in content:
        df = pd.concat([df, pd.DataFrame(p,index=[0])],ignore_index=True)
    df.to_csv("output.csv", index=False)


start()
write()

At the output to the content, I get this

[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]

And I want to separate this

products = [
    {
        "артикул": 12345,
        "высота": 50,
        "материал": "дерево",
    },
    {
        "артикул": 12346,
        "ширина": 30,
        "вес": 1.5,
    },
    {
        "артикул": 12347,
        "длина": 14,
        "высота": 6.2,
        "материал": "пластик",
    },
]

To get such a file as a link https://drive.google.com/file/d/1uGoW1kpsDGDA-Zh7SiiCDcg9cf2lHQUd/view?usp=sharing

It looks like your content should just be a dict, not a list you append 1-length dicts into. — AKX
– AKX, Commented Aug 25, 2021 at 7:56

AKX · Accepted Answer · 2021-08-25 08:05:41Z

1

You don't need Pandas to write CSV to a file.

For this case, you don't need Selenium either.

import csv
import sys
from typing import List

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
}


def get_html(url, params=None) -> str:
    r = requests.get(url, headers=HEADERS, params=params)
    r.raise_for_status()
    return r.text


def get_specifications_parameter(url: str) -> dict:
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")

    param = soup.find_all("dd", class_="range-revamp-product-dimensions__list-item-measure")
    titles = soup.find_all("dt", class_="range-revamp-product-dimensions__list-item-name")

    content = {}

    for title, value in zip(titles, param):
        content[title.text.strip("\xA0:")] = value.text
    return content


def scrape_urls(urls: List[str]) -> List[dict]:
    contents = []
    for url in urls:
        content = get_specifications_parameter(url)
        content["url"] = url
        print(content, file=sys.stderr)  # progress printing
        contents.append(content)
    return contents


def write_output(contents: List[dict]):
    # Figure out all keys in the content for CSV writer
    all_keys = set()
    for content in contents:
        all_keys |= set(content)
    # Write to standard output (could be a file too)
    w = csv.DictWriter(sys.stdout, all_keys)
    w.writeheader()
    for content in contents:
        w.writerow(content)


def main():
    urls = [
        "https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/",
        "https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/",
    ]
    contents = scrape_urls(urls)
    write_output(contents)


if __name__ == "__main__":
    main()

outputs

{'Ширина': '128 см', 'Глубина': '58 см', 'Мин высота': '59 см', 'Макс высота': '72 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/'}
{'Ширина': '73 см', 'Глубина': '50 см', 'Высота': '75 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/'}

as debug information, followed by

url,Макс нагрузка,Ширина,Высота,Мин высота,Макс высота,Глубина
https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/,50 кг,128 см,,59 см,72 см,58 см
https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/,50 кг,73 см,75 см,,,50 см

answered Aug 25, 2021 at 8:05

AKX

171k16 gold badges146 silver badges229 bronze badges

Sign up to request clarification or add additional context in comments.

5 Comments

Коля Нарушев Over a year ago

So well, then how do I write everything to a file so that the value is under its own key, because the amount of data will change

AKX Over a year ago

This will take care of the various product pages having different amounts of dimensions. (That's the "Figure out all keys in the content for CSV writer" bit.)

Коля Нарушев Over a year ago

I still don't understand how to convert the code that I would write.csv

AKX Over a year ago

The CSV writer is currently writing to sys.stdout - open("file.csv", "w") instead if you'd like to write to a file.

Коля Нарушев Over a year ago

Your answer works, thank you very much for being a kind, intelligent person, happiness, health and all the best

Collectives™ on Stack Overflow

Writing data to. csv pandas

1 Answer 1

5 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

5 Comments

Your Answer

Sign up or log in

Post as a guest

Related