0

I need to write data to a CSV file. I am currently parsing an online store, and each product has a different number of characteristics (for example weight, length, etc.). I am trying to write the data using pandas, but I can't get all the data into the dictionary correctly. How do I do this properly?

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import time
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

# Product pages to scrape.
URLS = [
    'https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/',
    'https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/',
]
# Browser-like User-Agent sent by get_html().
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'
    ),
}

# Shared accumulators filled in by get_specifications_parameter():
content = []  # one single-key {title: value} dict per scraped characteristic
colum = []  # distinct characteristic titles, in first-seen order
# NOTE: a Chrome session is started as soon as this line is executed.
driver = webdriver.Chrome(ChromeDriverManager().install())

def get_html(url, params=None):
    """Send a GET request for *url* using the module's HEADERS.

    The `requests` response object itself is returned (not its text), so
    the caller can inspect attributes such as ``status_code``.
    """
    return requests.get(url, headers=HEADERS, params=params)


def get_specifications_parameter():
    """Read the dimensions list of the page currently loaded in `driver`.

    Clicks the button located by a fixed XPath, then parses the markup of
    the pop-up container.  Every title not yet present in the module-level
    `colum` list is appended to it, and one single-key ``{title: value}``
    dict per measurement is appended to the module-level `content` list.
    The whole `content` list is printed at the end.
    """
    button_xpath = '//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button'
    modal_xpath = '//*[@id="range-modal-mount-node"]/div/div[4]/div/div[2]/div/div/div'

    # Give the button up to 5 seconds to become visible before clicking it.
    button_visible = expected_conditions.visibility_of_element_located((By.XPATH, button_xpath))
    WebDriverWait(driver, 5).until(button_visible)
    driver.find_element_by_xpath(button_xpath).click()

    # Fixed pause after the click, before the pop-up markup is read.
    time.sleep(3)
    modal_html = driver.find_element_by_xpath(modal_xpath).get_attribute('innerHTML')
    soup = BeautifulSoup(modal_html, 'html.parser')

    measures = soup.find_all('dd', class_='range-revamp-product-dimensions__list-item-measure')
    names = soup.find_all('dt', class_='range-revamp-product-dimensions__list-item-name')

    for name in names:
        if name.text not in colum:
            colum.append(name.text)

    # The i-th <dd> is paired with the i-th <dt>.
    for position, measure in enumerate(measures):
        content.append({names[position].text: measure.text})  # Writing characteristics to content
    print(content)

    
def get_content(url):
    """Load *url* in the shared Selenium `driver` and scrape its characteristics.

    The scraped values are stored in the module-level `content` list by
    get_specifications_parameter(); the list is printed once more here.
    """
    driver.get(url)
    get_specifications_parameter()

    # Additional data to be recorded for each product (not implemented yet):
    # name, price, photo, description.
    print(content)

def start():
    """Scrape every address in URLS whose plain HTTP check returns status 200.

    For any other status code 'Network error' is printed and the address
    is skipped.
    """
    for address in URLS:
        response = get_html(address)
        if response.status_code != 200:
            print('Network error')
            continue
        get_content(address)

def write():
    """Write the collected titles and characteristics to ``output.csv``.

    The frame built from `colum` comes first; one single-row frame per
    entry of `content` is then appended to it in order.  The row index is
    left out of the file.
    """
    table = pd.DataFrame(colum)
    for entry in content:
        row = pd.DataFrame(entry, index=[0])
        table = pd.concat([table, row], ignore_index=True)
    table.to_csv("output.csv", index=False)


# Scrape all product pages first, then dump everything collected to output.csv.
start()
write()

When I print content, this is the output I get:

[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]

But I want the data separated per product, like this:

products = [
    {
        "артикул": 12345,
        "высота": 50,
        "материал": "дерево",
    },
    {
        "артикул": 12346,
        "ширина": 30,
        "вес": 1.5,
    },
    {
        "артикул": 12347,
        "длина": 14,
        "высота": 6.2,
        "материал": "пластик",
    },
]

so that I end up with a file like the one at this link: https://drive.google.com/file/d/1uGoW1kpsDGDA-Zh7SiiCDcg9cf2lHQUd/view?usp=sharing

1
  • It looks like your content should just be a dict, not a list you append 1-length dicts into. Commented Aug 25, 2021 at 7:56

1 Answer 1

1

You don't need Pandas to write CSV to a file.

For this case, you don't need Selenium either.

import csv
import sys
from typing import List

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent by get_html().
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
    ),
}


def get_html(url, params=None, timeout: float = 10.0) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        params: Optional query-string parameters passed through to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely, so a single
            unresponsive page would stall the whole scrape.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    r = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
    r.raise_for_status()
    return r.text


def get_specifications_parameter(url: str) -> dict:
    """Fetch the product page at *url* and return its dimensions as a dict.

    Keys are the ``<dt>`` titles with leading/trailing colons and
    non-breaking spaces removed; values are the text of the ``<dd>``
    elements.  Titles and values are paired by position.
    """
    soup = BeautifulSoup(get_html(url), "html.parser")
    names = soup.find_all("dt", class_="range-revamp-product-dimensions__list-item-name")
    measures = soup.find_all("dd", class_="range-revamp-product-dimensions__list-item-measure")
    return {
        name.text.strip("\xA0:"): measure.text
        for name, measure in zip(names, measures)
    }


def scrape_urls(urls: List[str]) -> List[dict]:
    """Scrape each address in *urls* and return one record per address.

    Every record is the dict returned by get_specifications_parameter()
    with the source address stored under ``"url"``.  Records are echoed to
    stderr as they are produced.
    """
    records = []
    for address in urls:
        record = get_specifications_parameter(address)
        record["url"] = address
        print(record, file=sys.stderr)  # progress printing
        records.append(record)
    return records


def write_output(contents: List[dict], out=None):
    """Write *contents* as CSV with one column per distinct key.

    Args:
        contents: Records to write.  They may have different keys; a record
            that lacks a column gets an empty cell there.
        out: Writable text stream that receives the CSV.  Defaults to
            ``sys.stdout``.  When passing a file, open it with
            ``newline=""`` as the ``csv`` module documentation recommends.
    """
    if out is None:
        out = sys.stdout  # looked up at call time so redirection is honoured
    # Collect every key in first-seen order.  A plain set would also remove
    # duplicates, but its iteration order for strings changes between
    # interpreter runs, which would shuffle the CSV columns on every run.
    fieldnames = list(dict.fromkeys(key for content in contents for key in content))
    w = csv.DictWriter(out, fieldnames)
    w.writeheader()
    w.writerows(contents)


def main():
    """Scrape the two example product pages and emit them as CSV."""
    product_pages = [
        "https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/",
        "https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/",
    ]
    write_output(scrape_urls(product_pages))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

outputs

{'Ширина': '128 см', 'Глубина': '58 см', 'Мин высота': '59 см', 'Макс высота': '72 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/'}
{'Ширина': '73 см', 'Глубина': '50 см', 'Высота': '75 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/'}

as debug information, followed by

url,Макс нагрузка,Ширина,Высота,Мин высота,Макс высота,Глубина
https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/,50 кг,128 см,,59 см,72 см,58 см
https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/,50 кг,73 см,75 см,,,50 см
Sign up to request clarification or add additional context in comments.

5 Comments

OK, but then how do I write everything to a file so that each value ends up under its own key? The amount of data will vary.
This will take care of the various product pages having different amounts of dimensions. (That's the "Figure out all keys in the content for CSV writer" bit.)
I still don't understand how to change the code so that it writes to a .csv file.
The CSV writer is currently writing to sys.stdout; pass it a file opened with open("file.csv", "w") instead if you'd like to write to a file.
Your answer works, thank you very much for being a kind, intelligent person, happiness, health and all the best

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.