How do I generate test data for my Python script?

Question

An equation takes values in the following form:

   x = [0x02,0x00]  # later converted internally, by the called function, to 0x300
   y = [0x01, 0xFF]
   z = [0x01, 0x0F]

How do I generate a series of test values for this function? For instance, I want to send 100-odd values from a for loop:

for i in range(0,300):
   # a and b are meant to be derived from a range of values; they are not
   # defined in this sketch, and how to produce them is the question being asked.
   x = [a,b]

My question was a bit unclear, so please let me clarify: what I wanted to ask is how I can generate different values for a and b in x = [a, b].

bayda · Accepted Answer · 2009-03-24 05:05:29Z

4

use generators:

def gen_xyz( max_iteration ):
    """Yield one ``(x, y, z)`` tuple per loop iteration, ``max_iteration`` times.

    This is a template: ``x``, ``y`` and ``z`` are not assigned anywhere in the
    snippet, so the code that computes them has to be filled in at the
    placeholder comment inside the loop before the generator can run.
    """
    # NOTE: xrange is the Python 2 builtin; Python 3 only provides range.
    for i in xrange( 0, max_iteration ):
       # Placeholder: compute the next x, y and z here (for example from i).
       yield ( x, y, z ) 

# Pass each generated triple to f; f is not defined in this snippet and
# presumably stands for the function under test.
for x, y, z in gen_xyz( 1000 ):
  f( x, y, z )

answered Mar 24, 2009 at 5:05

bayda

13.6k9 gold badges42 silver badges48 bronze badges

Sign up to request clarification or add additional context in comments.

Comments

dbr · Accepted Answer · 2009-03-24 06:48:50Z

2

The hex() function?

import random

# Print ten [a, b] pairs of random values rendered as hexadecimal strings.
for _ in range(10):
    # randint is inclusive at both ends, so each value lies in 1..100.
    a1, a2 = random.randint(1, 100), random.randint(1, 100)
    # hex() returns a string such as '0x21', not an int.
    x = [hex(a1), hex(a2)]
    # The call form is required on Python 3 (the statement form `print x` is a
    # SyntaxError there) and prints the same text on Python 2.
    print(x)

..outputs something similar to..

['0x21', '0x4f']
['0x59', '0x5c']
['0x61', '0x40']
['0x57', '0x45']
['0x1a', '0x11']
['0x4c', '0x49']
['0x40', '0x1b']
['0x1f', '0x7']
['0x8', '0x2b']
['0x1e', '0x13']

answered Mar 24, 2009 at 6:48

dbr

171k69 gold badges284 silver badges348 bronze badges

Comments

Ganesh A · Accepted Answer · 2024-11-12 06:48:40Z

import pandas as pd
import numpy as np
import random
from datetime import timedelta
import re
import os

# Function to generate unique random integers within the range of a column in the original dataframe
def generate_unique_random_numbers(df_new, df_original, num_cols, num_rows):
    """Fill numeric columns of ``df_new`` with random integers.

    For every column in ``num_cols`` the integers are drawn from the closed
    range spanned by that column's minimum and maximum in ``df_original``
    (both converted with ``int()``, i.e. truncated).

    Args:
        df_new: DataFrame to fill; modified in place. It must have exactly
            ``num_rows`` rows because a list of that length is assigned.
        df_original: DataFrame supplying the min/max of each column.
        num_cols: Names of the columns to fill.
        num_rows: How many values to generate per column.

    Returns:
        ``df_new`` (the same object that was passed in).
    """
    for col in num_cols:
        low = df_original[col].min()
        high = df_original[col].max()
        # An empty or all-NaN column has no usable range; int(nan) would raise
        # ValueError, so leave the target column untouched instead.
        if pd.isna(low) or pd.isna(high):
            continue
        min_val = int(low)
        max_val = int(high)
        candidates = range(min_val, max_val + 1)
        if max_val - min_val + 1 >= num_rows:
            # Enough distinct integers: sample without replacement (all unique).
            df_new[col] = random.sample(candidates, num_rows)
        else:
            # Too few distinct integers: fall back to sampling with replacement.
            df_new[col] = random.choices(candidates, k=num_rows)
    return df_new

# Function to generate random dates within the date range of a column in the original dataframe
def generate_random_dates(df_new, df_original, date_cols, num_rows):
    """Fill date columns of ``df_new`` with random dates.

    For a column whose dtype in ``df_original`` is a datetime64 type, each
    generated value is the column's minimum plus a random whole number of days
    no larger than the day span between its minimum and maximum. For any
    other dtype the first original value is repeated ``num_rows`` times.

    Args:
        df_new: DataFrame to fill; modified in place. It must have exactly
            ``num_rows`` rows because a list of that length is assigned.
        df_original: DataFrame supplying the date range of each column.
        date_cols: Names of the columns to fill.
        num_rows: How many values to generate per column.

    Returns:
        ``df_new`` (the same object that was passed in).
    """
    for col in date_cols:
        source = df_original[col]
        if not pd.api.types.is_datetime64_any_dtype(source):
            # Not a datetime dtype (e.g. dates stored as strings): keep it constant.
            df_new[col] = [source.iloc[0]] * num_rows
            continue

        min_date = source.min()
        max_date = source.max()
        # An empty or all-NaT column yields NaT here; its day span is NaN and
        # random.randint would fail, so leave the target column untouched.
        if pd.isna(min_date) or pd.isna(max_date):
            continue

        # The span does not change per row, so compute it once.
        span_days = (max_date - min_date).days
        df_new[col] = [
            min_date + timedelta(days=random.randint(0, span_days))
            for _ in range(num_rows)
        ]
    return df_new

# Function to generate random email IDs based on a base email string
def generate_random_emails(df_new, email_col, base_email, num_rows):
    """Fill ``email_col`` of ``df_new`` with numbered variants of ``base_email``.

    The 1-based row number is appended to the part before the '@', so
    ``"qa@example.com"`` produces ``qa1@example.com``, ``qa2@example.com``, ...

    Args:
        df_new: DataFrame to fill; modified in place. It must have exactly
            ``num_rows`` rows because a list of that length is assigned.
        email_col: Name of the column to fill.
        base_email: Template address; must contain exactly one '@'.
        num_rows: How many addresses to generate.

    Returns:
        ``df_new`` (the same object that was passed in).

    Raises:
        ValueError: If ``base_email`` does not contain exactly one '@'.
    """
    # Fail with a clear message instead of an opaque tuple-unpacking error.
    if base_email.count('@') != 1:
        raise ValueError(f"base_email must contain exactly one '@': {base_email!r}")
    email_username, email_domain = base_email.split('@')
    # Generate unique email IDs using the base email and a 1-based index
    df_new[email_col] = [f"{email_username}{n}@{email_domain}" for n in range(1, num_rows + 1)]
    return df_new

# Function to detect email columns based on the content
def detect_email_columns(df):
    """Return the names of columns holding at least one e-mail-like value.

    Every cell is converted to ``str`` and tested with ``re.match`` against
    ``something@something.something``; one matching cell is enough for the
    column to be reported. Column order follows ``df.columns``.
    """
    looks_like_email = re.compile(r"[^@]+@[^@]+\.[^@]+").match

    def has_email(column):
        # Match objects are always truthy, so "is not None" equals bool(match).
        as_text = df[column].astype(str)
        return as_text.apply(lambda cell: looks_like_email(cell) is not None).any()

    return [column for column in df.columns if has_email(column)]

# Main function to read the input file, generate new test data, and save/print it
def generate_test_data(input_file, num_rows, output_folder, default_values=None, base_email="user@example.com"):
    """Generate synthetic rows for every sheet of an Excel workbook.

    Each sheet is read with its first row skipped. A new frame with the same
    columns and ``num_rows`` rows is created; numeric, datetime and e-mail
    columns are filled by the helper functions, columns named in
    ``default_values`` are overwritten with the given constant, and any other
    column stays empty. The result is printed as CSV and written to
    ``<output_folder>/<sheet_name>_Generated_Test_Data.csv``.

    Args:
        input_file: Path of the Excel workbook to read.
        num_rows: Number of rows to generate per sheet.
        output_folder: Directory for the CSV files; created if missing.
        default_values: Optional mapping of column name to a constant value
            for all rows. Columns absent from a sheet are ignored.
        base_email: Template address used for detected e-mail columns.

    Returns:
        None.
    """
    # Avoid a shared mutable default argument.
    if default_values is None:
        default_values = {}

    # sheet_name=None loads every sheet into a {sheet name: DataFrame} mapping.
    excel_data = pd.read_excel(input_file, sheet_name=None, skiprows=1)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for sheet_name, df_original in excel_data.items():
        print(f"\nProcessing sheet: {sheet_name}")

        # Create a new dataframe with the specified number of rows, filled with NaN initially
        df_new = pd.DataFrame(index=range(num_rows), columns=df_original.columns)

        # Classify the original columns by dtype and by content.
        num_cols = df_original.select_dtypes(include=[np.number]).columns.tolist()
        date_cols = df_original.select_dtypes(include=['datetime']).columns.tolist()
        email_cols = detect_email_columns(df_original)

        # The helpers fill df_new in place and return it, so no copies are needed.
        df_new = generate_unique_random_numbers(df_new, df_original, num_cols, num_rows)
        df_new = generate_random_dates(df_new, df_original, date_cols, num_rows)
        for email_col in email_cols:
            df_new = generate_random_emails(df_new, email_col, base_email, num_rows)

        # Constant defaults are applied last, so they override generated values.
        for col, default_value in default_values.items():
            if col in df_new.columns:
                df_new[col] = [default_value] * num_rows

        # Print the generated data to the console for verification
        print(f"Generated test data for sheet '{sheet_name}':")
        print(df_new.head(num_rows).to_csv(index=False))

        # Save the generated data to a CSV file in the output folder
        output_file = os.path.join(output_folder, f"{sheet_name}_Generated_Test_Data.csv")
        df_new.to_csv(output_file, index=False)
        print(f"Generated test data saved to {output_file}")

# Example usage:
default_values = {
    'Status': 'Active',  # Example: All rows will have 'Active' in the 'Status' column
    'Country': 'USA',
}

# Input Excel file
input_file = r"C:\Test_data.xlsx"

# Output folder for generated CSV files
output_folder = r"C:\Generated_Test_Data"

# Specify the number of rows to generate (e.g., 10 rows)
num_rows = 10

# Template address for generated e-mail columns. It is split on '@', so it
# must be a real address shape with exactly one '@'.
base_email = "user@example.com"

# Generate, print, and save the test data for each sheet. Guarded so that
# importing this module does not read or write any files.
if __name__ == "__main__":
    generate_test_data(input_file, num_rows, output_folder, default_values=default_values, base_email=base_email)

Collectives™ on Stack Overflow

How do I generate test data for my Python script?

3 Answers 3

Comments

Comments

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

3 Answers 3

Comments

Comments

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related