I have a custom object which stores dataframes in memory given a certain hierarchy, and I want to store this data in a file while maintaining the hierarchy. This hierarchy involves parents, children, grandchildren, and so on. Here is a small example:
df1 = {'A': [1,2,3,4], 'B': [5,6,7,8]}
df2 = {'C': [4,3,2,1], 'D': [8,7,6,5]}
df3 = {'E': [2,3,4,5], 'F': [6,7,8,9]}
Final output (pipes are important):
|1|5| # ←--- From df1
|4|8| # ←--- From df2
|3|7|
|2|6|
|1|5|
|2|6| # ←--- Back to df1
… # ←--- df3 starts here
Here is a minimal working example of what I want to achieve; however, it is not a scalable solution for a file with over one million lines of text (it hardly scales to thousands).
import pandas as pd
import numpy as np
from time import time
# DATA STRUCTURE
class Block:
    """A named container that maps registry names to DataFrames."""

    def __init__(self, name: str, data: dict[str, pd.DataFrame]):
        self.name = name        # identifier used as the key inside Sped
        self.registries = data  # registry name -> DataFrame

    def __getitem__(self, registry: str) -> pd.DataFrame:
        """Allow dictionary-style access to a registry."""
        return self.registries[registry]
class Sped:
    """Registry of Block objects, keyed by each block's own name."""

    def __init__(self):
        # name -> Block; populated via add_block()
        self.blocks: dict[str, "Block"] = {}

    def __getitem__(self, name: str) -> "Block":
        return self.blocks[name]

    def add_block(self, block: "Block") -> None:
        """Register *block* under its name (a duplicate name overwrites)."""
        self.blocks[block.name] = block
# GENERATE FAKE DATA
n_rows = 1000
n_cols = 5
column_names = [f"Column_{i}" for i in range(1, n_cols + 1)]

# Three string-typed frames: one wide (grandchild), one 2-row (child),
# one single-row (parent). The RNG calls keep the original order/sizes.
random_data = np.random.randint(0, 100, size=(n_rows, n_cols))
grandchild = pd.DataFrame(random_data, columns=column_names).astype(str)
random_data = np.random.randint(0, 100, size=(2, n_cols))
child = pd.DataFrame(random_data, columns=column_names).astype(str)
random_data = np.random.randint(0, 100, size=(1, n_cols))
parent = pd.DataFrame(random_data, columns=column_names).astype(str)

# Wire up the parent/child/grandchild identifiers (kept as ints, as before).
parent["id"] = 1
child["id"] = [1, 2]
child["parent_id"] = 1
grandchild["id"] = range(n_rows)
grandchild["parent_id"] = 1

# Put the id columns first so they lead the serialized rows.
parent = parent[["id", *column_names]]
child = child[["id", "parent_id", *column_names]]
grandchild = grandchild[["id", "parent_id", *column_names]]

# Assemble the hierarchy container.
parent_dict = {"Parent": parent}
child_dict = {"Child": child}
grandchild_dict = {"Grandchild": grandchild}
parent_block = Block("Parent", parent_dict)
child_block = Block("Child", child_dict)
grandchild_block = Block("Grandchild", grandchild_dict)
sped = Sped()
sped.add_block(parent_block)
sped.add_block(child_block)
sped.add_block(grandchild_block)
# FUNCTION TO BE OPTIMIZED
def _save_row(block, registry, row_index, column_id, parent_id):
# Work on a copy to not destroy the original data in memory.
temp_table = sped[block][registry].copy()
# Cast id column from int to object to replace it with a string.
temp_table = temp_table.astype("object")
# Set the column id as the text version of the sped.
temp_table.loc[row_index, column_id] = f"|{registry}"
# Remove parent ID as they don't exist in speds. Some registries don't have parents so they pass None (idealy).
if parent_id:
temp_table.drop(parent_id, axis=1, inplace=True)
# Create one string from the data in sped format.
return "|".join(temp_table.values[row_index]) + "|\n"
# USAGE OF FUNCTION (WHICH CAN ALSO BE OPTIMIZED)
start = time()
# Parent row, first child row, all grandchild rows, then the second child row.
sped_lines = [
    _save_row("Parent", "Parent", 0, "id", None),
    _save_row("Child", "Child", 0, "id", "parent_id"),
]
sped_lines.extend(
    _save_row("Grandchild", "Grandchild", i, "id", "parent_id")
    for i in range(len(sped["Grandchild"]["Grandchild"]))
)
sped_lines.append(_save_row("Child", "Child", 1, "id", "parent_id"))
with open("temp.txt", 'w', encoding="utf-8") as fp:
    fp.writelines(sped_lines)
print(f"Time take to write sped (in seconds): {time() - start}")
How can I optimize this solution to run faster?