1

I created a script which parse YAML files to Excel. I've made part of code which adjust width of column but now I want to wrap text for Desciprtion column.

After script execution:

Incorrect Description

What I want:

Correct Description

Here is my code:

import yaml
import os
import pandas as pd
from openpyxl.utils import get_column_letter
import sys


def yaml_to_excel(dictionary, row):
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from yaml_to_excel(value, row + 1)
        elif type(value) is list:
            for item in value:
                yield from yaml_to_excel(item, row + 1)
                row += 1
        else:
            yield (key, value, row)


def get_all_subdirectiries_in_path(root_dir):
    all_subdirectories = []

    for file in os.listdir(root_dir):
        d = os.path.join(root_dir, file)
        if os.path.isdir(d):
            all_subdirectories.append(d)

    return all_subdirectories


def dfs_tabs(df_list, sheet_list, file_name, columns):
    writer = pd.ExcelWriter(file_name,engine='openpyxl')
    for dataframe, sheet, columns_list in zip(df_list, sheet_list, columns):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0 , startcol=0, index=False, columns=columns_list)

        for column in dataframe:
            column_width = max(dataframe[column].astype(str).map(len).max() + 5, len(column) + 5)
            col_letter = get_column_letter(dataframe.columns.get_loc(column) + 1)
            writer.sheets[sheet].column_dimensions[col_letter].width = column_width

    writer.save()


def get_nested_dir_path(main_dir):
    if any(File.endswith(".yml") for File in os.listdir(main_dir)):
        yield main_dir
    else:
        for file in os.listdir(main_dir):
            yield from get_nested_dir_path(main_dir + f"/{file}")


def main(dir_path):
    all_sheets = []
    sheet_names = []
    all_columns = []
    for subdirectory in get_nested_dir_path(dir_path):
        final_object = []
        last_row = 1
        current_row = 1

        for file in os.listdir(subdirectory):
            # Check if YAML file exist
            if (file.split(".")[1] == "yml") | (file.split(".")[1] == "yaml"):
                with open(subdirectory + f"/{file}", "r", encoding="utf-8") as f:
                    dataMap = yaml.safe_load(
                        f,
                    )

                last_row += 1
                hierarchy_obj = {}
                hierarchy = 1

                prev_value = ""
                prev_temp_depth = 2
                current_row += 1

                starting_row = current_row

                for key, value, temp_depth in yaml_to_excel(dataMap, last_row):
                    if key not in hierarchy_obj:
                        hierarchy_obj[key] = hierarchy
                        hierarchy += 1

                    if prev_value != "":
                        if hierarchy_obj[key] < hierarchy_obj[prev_value]:
                            current_row += 1
                        if (temp_depth > prev_temp_depth) & (
                            hierarchy_obj[key] > hierarchy_obj[prev_value]
                        ):
                            current_row += 1

                    if type(value) is bool:
                        if value:
                            value = 'yes'
                        else:
                            value = 'no'

                    final_object.append(
                        {
                            "column": hierarchy_obj[key],
                            "value": value,
                            "row": current_row,
                        }
                    )

                    prev_value = key
                    prev_temp_depth = temp_depth

        columns_arr = []
        for column_name in hierarchy_obj.keys():
            columns_arr.append(column_name)
        all_columns.append(columns_arr)
        last_row_in_excel = final_object[len(final_object)-1]['row']

        main_arr = []

        for i in range(last_row_in_excel):
            main_arr.append(["" for i in range(len(columns_arr))])

        main_arr = [main_arr]

        for item in final_object:
            if (type(item['value']) == int) | (type(item['value']) == float):
                main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
            else:
                main_arr[0][item['row'] - 1][item['column'] -1] = item['value']

        result = pd.DataFrame(main_arr[0], columns = columns_arr)
        result = result[1:]
        all_sheets.append(result)
        sheet_names.append(subdirectory[subdirectory.rfind('/')+1 :].replace("/", " ").replace("_", " ")[0:31])

    dfs_tabs(all_sheets, sheet_names, 'test.xlsx', all_columns)


if __name__ == "__main__":
    dir_path = 'Lands'
    main(dir_path)

And below you can find YAML file which I base of:

Country: France
Capital City: Paris
Language: French
Description: |-
  A country whose metropolitan part is in Western Europe, and also has overseas territories on other continents.

  The most important people:
  President: Emmanuel Macron
  Prime Minister: Élisabeth Borne
Year: 2020
Population:
- Total population: 67 522 000
  Year: 2021
  Nationality:
  - French population: 87%
    German population: 2%
    Year: 2022
    Paris neighborhoods:
    - 1st district of Paris: Louvre
      Paris II district: Bourse
      Year: 2023
Additional information:

To run my script you must create folder Lands/France/france.yml

I will be grateful for any tips how to wrap text in Description column!

1
  • you can split your text in cell with '/n' new line Commented Sep 25, 2022 at 11:33

1 Answer 1

1

I suggest you in this particular situation to parse the .YAML file with pandas.json_normalize and use the format class of xlswriter to wrap the text in the column Description.

Try this :

import pandas as pd
from yaml import safe_load

with open(r"Lands\France\france.yaml", 'r', encoding='utf-8') as f:
    df = pd.json_normalize(safe_load(f))
    
# --- Selecting columns
description_index = df.columns.get_loc("Description")
df.iloc[:, 0:description_index+1]

with pd.ExcelWriter('final_result.xlsx') as writer :
    df.to_excel(writer, sheet_name='Sheet1', index=False)

    # --- Wrapping the text of column D
    workbook=writer.book
    worksheet = writer.sheets['Sheet1']
    format = workbook.add_format({'text_wrap': True})
    worksheet.set_column(description_index, description_index, None, format)

    # --- Auto-fit all the columns
    for column in df:
        column_width = max(df[column].astype(str).map(len).max(), len(column))
        col_idx = df.columns.get_loc(column)
        worksheet.set_column(col_idx, col_idx, column_width)

# Output (in Excel) :

enter image description here

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.