import pandas as pd
import numpy as np
import random
from datetime import timedelta
import re
import os
# Function to generate unique random integers within the range of a column in the original dataframe
def generate_unique_random_numbers(df_new, df_original, num_cols, num_rows):
    """Fill the given numeric columns of ``df_new`` with random integers.

    For each column the integers are drawn from the inclusive range
    ``[int(min), int(max)]`` of the same column in ``df_original``.

    Args:
        df_new: DataFrame to fill; it is modified in place and returned.
        df_original: DataFrame whose column minima/maxima bound the values.
        num_cols: Names of the numeric columns to fill.
        num_rows: Number of values to generate per column.

    Returns:
        ``df_new`` with the requested columns populated. Values are unique
        when the range holds at least ``num_rows`` integers; otherwise they
        are drawn with replacement. Columns whose source has no valid
        (non-NaN) value are left untouched.
    """
    for col in num_cols:
        lowest = df_original[col].min()
        highest = df_original[col].max()
        # min()/max() are NaN for an empty or all-NaN column and int(NaN)
        # raises, so there is no range to sample from: skip the column.
        if pd.isna(lowest) or pd.isna(highest):
            continue
        min_val = int(lowest)
        max_val = int(highest)
        candidates = range(min_val, max_val + 1)
        if max_val - min_val + 1 >= num_rows:
            # Enough distinct integers: sample without replacement.
            df_new[col] = random.sample(candidates, num_rows)
        else:
            # Too few distinct integers: fall back to sampling with replacement.
            df_new[col] = random.choices(candidates, k=num_rows)
    return df_new
# Function to generate random dates within the date range of a column in the original dataframe
def generate_random_dates(df_new, df_original, date_cols, num_rows):
    """Fill the given date columns of ``df_new`` with random dates.

    Args:
        df_new: DataFrame to fill; it is modified in place and returned.
        df_original: DataFrame that supplies the date range per column.
        date_cols: Names of the date columns to fill.
        num_rows: Number of values to generate per column.

    Returns:
        ``df_new`` with the requested columns populated. For datetime
        columns each value is the source minimum plus a random whole number
        of days, never exceeding the source maximum. For any other dtype the
        first source value is repeated. Columns whose source holds no usable
        value (all-NaT datetimes, or an empty frame) are left untouched.
    """
    for col in date_cols:
        source = df_original[col]
        if pd.api.types.is_datetime64_any_dtype(source):
            min_date = source.min()
            max_date = source.max()
            # An all-NaT (or empty) column yields NaT bounds, which cannot be
            # turned into a day span for randint.
            if pd.isna(min_date) or pd.isna(max_date):
                continue
            span_days = (max_date - min_date).days
            df_new[col] = [
                min_date + timedelta(days=random.randint(0, span_days))
                for _ in range(num_rows)
            ]
        elif len(source) > 0:
            # If the date format is different (like strings), keep it as a constant
            df_new[col] = [source.iloc[0]] * num_rows
    return df_new
# Function to generate random email IDs based on a base email string
def generate_random_emails(df_new, email_col, base_email, num_rows):
    """Fill ``email_col`` of ``df_new`` with numbered variants of ``base_email``.

    The addresses are ``<user>1@<domain>``, ``<user>2@<domain>``, ... so they
    are unique within the column.

    Args:
        df_new: DataFrame to fill; it is modified in place and returned.
        email_col: Name of the column to populate.
        base_email: Template address of the form ``user@domain``.
        num_rows: Number of addresses to generate.

    Returns:
        ``df_new`` with ``email_col`` populated.

    Raises:
        ValueError: If ``base_email`` does not contain exactly one ``@`` with
            non-empty text on both sides.
    """
    email_username, separator, email_domain = base_email.partition('@')
    # Validate explicitly: a bare tuple-unpack of split('@') fails with an
    # opaque "not enough values to unpack" message on malformed templates.
    if not (separator and email_username and email_domain) or '@' in email_domain:
        raise ValueError(
            f"base_email must have the form 'user@domain', got {base_email!r}"
        )
    # Generate unique email IDs using a base email and an index
    df_new[email_col] = [
        f"{email_username}{number}@{email_domain}"
        for number in range(1, num_rows + 1)
    ]
    return df_new
# Function to detect email columns based on the content
def detect_email_columns(df):
    """Return the names of columns that contain at least one email address.

    A cell counts as an email address only when its whole text (ignoring
    surrounding whitespace) has the shape ``something@something.something``
    with no embedded whitespace or extra ``@``. Free-text cells that merely
    mention an address therefore do not mark their column as an email column.

    Args:
        df: DataFrame to inspect.

    Returns:
        List of matching column names, in the order they appear in ``df``.
    """
    email_pattern = re.compile(r"[^@\s]+@[^@\s]+\.[^@\s]+")
    email_columns = []
    for col in df.columns:
        values = df[col]
        # Numeric and datetime values never render as an address, so skip the
        # cost of stringifying them.
        if (pd.api.types.is_numeric_dtype(values)
                or pd.api.types.is_datetime64_any_dtype(values)):
            continue
        # Check if the column contains at least one value that is, in full,
        # an email address.
        is_email = values.astype(str).apply(
            lambda text: email_pattern.fullmatch(text.strip()) is not None
        )
        if is_email.any():
            email_columns.append(col)
    return email_columns
# Main function to read the input file, generate new test data, and save/print it
def generate_test_data(input_file, num_rows, output_folder, default_values=None,
                       base_email="test@example.com", skiprows=1):
    """Generate synthetic rows for every sheet of an Excel workbook.

    For each sheet a new frame with the sheet's columns is built: numeric
    columns get random integers, datetime columns get random dates, columns
    detected as holding email addresses get numbered addresses, and columns
    named in ``default_values`` get a constant. Any other column stays empty.
    The result is printed as CSV and written to
    ``<output_folder>/<sheet_name>_Generated_Test_Data.csv``.

    Args:
        input_file: Path of the Excel workbook to read.
        num_rows: Number of rows to generate per sheet.
        output_folder: Directory for the CSV files; created if missing.
        default_values: Optional mapping of column name to a constant value.
            Applied last, so it overrides generated values.
        base_email: Template ``user@domain`` address for email columns.
        skiprows: Number of leading rows to skip in every sheet before the
            header row (forwarded to ``pandas.read_excel``).
    """
    # Avoid a shared mutable default argument.
    constants = default_values if default_values is not None else {}

    # Read the input Excel file with multiple sheets
    excel_data = pd.read_excel(input_file, sheet_name=None, skiprows=skiprows)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for sheet_name, df_original in excel_data.items():
        print(f"\nProcessing sheet: {sheet_name}")

        # Create a new dataframe with the specified number of rows, filled with NaN initially
        df_new = pd.DataFrame(index=range(num_rows), columns=df_original.columns)

        # Classify the source columns by content.
        num_cols = df_original.select_dtypes(include=[np.number]).columns.tolist()
        date_cols = df_original.select_dtypes(include=['datetime']).columns.tolist()
        email_cols = detect_email_columns(df_original)

        # The helpers fill df_new in place and return it; df_new is local to
        # this iteration, so no defensive copies are needed.
        df_new = generate_unique_random_numbers(df_new, df_original, num_cols, num_rows)
        df_new = generate_random_dates(df_new, df_original, date_cols, num_rows)
        for email_col in email_cols:
            df_new = generate_random_emails(df_new, email_col, base_email, num_rows)

        # Add constant default values
        for col, default_value in constants.items():
            if col in df_new.columns:
                df_new[col] = [default_value] * num_rows

        # Print the generated data to the console for verification
        print(f"Generated test data for sheet '{sheet_name}':")
        print(df_new.to_csv(index=False))

        # Save the generated data to a CSV file in the output folder
        output_file = os.path.join(output_folder, f"{sheet_name}_Generated_Test_Data.csv")
        df_new.to_csv(output_file, index=False)
        print(f"Generated test data saved to {output_file}")
# Example usage:
default_values = {
    'Status': 'Active',  # Example: All rows will have 'Active' in the 'Status' column
    'Country': 'USA',
}
# Input Excel file
input_file = r"C:\Test_data.xlsx"
# Output folder for generated CSV files
output_folder = r"C:\Generated_Test_Data"
# Specify the number of rows to generate (e.g., 10 rows)
num_rows = 10

# Run the generator only when executed as a script, so importing this module
# does not read or write any files.
if __name__ == "__main__":
    # Generate, print, and save the test data for each sheet.
    # base_email must be a real 'user@domain' template; the previous
    # placeholder contained no '@' and could not be split into user/domain.
    generate_test_data(
        input_file,
        num_rows,
        output_folder,
        default_values=default_values,
        base_email="test@example.com",
    )