Pandas add missing dates depending on 2 variables

Question

I have a time series of 3 different products, which have been sold at 4 different stores over some time period. I want to fill in the missing data so that I have a complete data set. All the missing data should be substituted by 0.

Here is the code to generate the dataset. The randomtimes function was copied from @abarnert's answer to [Generate random list of timestamps in python](https://stackoverflow.com/questions/50165501/generate-random-list-of-timestamps-in-python).

import datetime
import random
import pandas as pd
import numpy as np

# Seed both random number generators used below (the stdlib `random` module
# and NumPy's global generator) so the toy data is the same on every run.
random.seed(42)
np.random.seed(42) 

def randomtimes(start, end, n):
    """Return ``n`` pseudo-random datetimes between ``start`` and ``end``.

    ``start`` and ``end`` are date strings in ``%d-%m-%Y`` format. Each draw
    takes a random fraction of the interval, rounded to one decimal place, so
    results fall on tenths of the span and duplicates are likely. The length
    of the interval is printed once per call.
    """
    date_format = '%d-%m-%Y'
    first = datetime.datetime.strptime(start, date_format)
    last = datetime.datetime.strptime(end, date_format)
    span = last - first
    print(span)

    drawn = []
    for _ in range(n):
        # Rounding the fraction to one decimal snaps the offset to tenths of the span.
        fraction = round(random.random(), 1)
        drawn.append(fraction * span + first)
    return drawn

  # set vars
# --- sizing constants -------------------------------------------------------
# NOTE: `ns` and `npr` are only used to size the sampling pools below. The
# generated frame ends up with three product ids ('1'-'3') and two store ids
# ('a', 'b'), which is what the sample output shows.
nsp = 5  # number of rows to generate (one random date per row)
nd = 3   # number of days
ns = 3   # pool-size factor (named for stores; see NOTE above)
npr = 2  # pool-size factor (named for products; see NOTE above)

# --- candidate pools --------------------------------------------------------
# The RNG calls are kept in this exact order so the seeded output is unchanged.
total = nd * ns * npr
product_pool = random.sample('1' * nd * ns + '2' * nd * ns + '3' * nd * ns, total)
store_pool = random.sample('a' * nd * ns + 'b' * nd * ns, total)
sold_pool = list(np.random.choice(range(20, 100), total))
stime = '01-02-2000'
etime = '03-02-2000'
date = np.array(randomtimes(stime, etime, nsp)).astype('datetime64[D]')

# --- assemble the rows ------------------------------------------------------
# Take one value per date from the END of each pool (last element first),
# which yields the same sequence as popping each pool once per row.
n_rows = len(date)
product = product_pool[::-1][:n_rows]
store = store_pool[::-1][:n_rows]
sold = sold_pool[::-1][:n_rows]

data = {
    'date': date,
    'product': product,
    'sold': sold,
    'store': store,
}
df = pd.DataFrame(data)
df

            date product  sold store
0 2000-02-02       3    95     b
1 2000-02-01       1    88     a 
2 2000-02-02       1    81     a
3 2000-02-03       1    66     a
4 2000-02-02       3    88     a

The result should look like this.

 0 2000-02-01       1    88     a
 1 2000-02-01       2     0     a
 2 2000-02-01       3     0     a
 3 2000-02-01       1     0     b
 4 2000-02-01       2     0     b
 5 2000-02-01       3     0     b

 6 2000-02-02       1    81     a
 7 2000-02-02       2     0     a
 8 2000-02-02       3    88     a
 9 2000-02-02       1     0     b
10 2000-02-02       2     0     b
11 2000-02-02       3    95     b

12 2000-02-03       1    66     a
13 2000-02-03       2     0     a
14 2000-02-03       3     0     a
15 2000-02-03       1     0     b
16 2000-02-03       2     0     b
17 2000-02-03       3     0     b

Also, is there a better way to generate this toy data?

I appreciate your help.

insulanus · Accepted Answer · 2021-04-02 09:45:44Z

I would propose to create a second data frame containing all known values (dates, products, stores) with all "sold" values set to zero. Afterwards, you can loop through the existing "sold" data and copy them to the new data frame.

Below you find a working example illustrating this procedure.

import datetime
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the np.random.choice draws repeat between
# runs (the dates still depend on the day the script is executed).
np.random.seed(0)
n_days = 5  # number of rows simulate_data() generates (one random day per row)
products = [1,2,3]  # list of product ids
stores = ["a", "b", "c", "d"]  # list of store ids

def simulate_data():
    """
    return: data frame with n_days rows and the columns date, product, sold, store

    Every column is drawn at random: the dates come from random_days(), the
    product and store ids from the module-level lists, and the sold amounts
    from 0..99.
    """
    row_range = range(n_days)
    # Columns are drawn in the order date, product, sold, store; that order
    # decides how the seeded random stream is consumed.
    date_column = random_days(n_days)
    product_column = [np.random.choice(products) for _ in row_range]
    sold_column = [np.random.choice(range(100)) for _ in row_range]
    store_column = [np.random.choice(stores) for _ in row_range]
    return pd.DataFrame(
        {
            "date": date_column,
            "product": product_column,
            "sold": sold_column,
            "store": store_column,
        }
    )

def random_days(n, daydelta=7):
    """
    return: list of n random dates (datetime.date), duplicates possible
    n: number of dates to draw
    daydelta: size of the window; every date is today plus 0..daydelta-1 days
    """
    offsets = range(daydelta)
    steps = []
    for _ in range(n):
        # int() turns the NumPy integer into a plain int for timedelta.
        steps.append(datetime.timedelta(days=int(np.random.choice(offsets))))
    base_day = datetime.date.today()
    return [base_day + step for step in steps]

def get_dfMask(df, product_ids=None, store_ids=None):
    """
    return: zero-filled data frame with one row per (date, store, product)
            combination and the columns date, product, sold, store; rows are
            ordered by date, then store, then product
    df: data frame with a "date" column; its earliest and latest dates bound
        the calendar (both ends included)
    product_ids: product ids to enumerate; defaults to the module-level products
    store_ids: store ids to enumerate; defaults to the module-level stores
    """
    if product_ids is None:
        product_ids = products
    if store_ids is None:
        store_ids = stores

    first_day = min(df.date)
    # +1 keeps the latest observed day in the calendar. Without it range()
    # stops one day early and sales on that day have no row to be copied into.
    day_count = (max(df.date) - first_day).days + 1
    unique_dates = [first_day + datetime.timedelta(days=offset) for offset in range(day_count)]

    # Product varies fastest, then store, then date.
    combos = [
        (day, store, product)
        for day in unique_dates
        for store in store_ids
        for product in product_ids
    ]
    data = {
        "date": [day for day, _, _ in combos],
        "product": [product for _, _, product in combos],
        "sold": [0] * len(combos),
        "store": [store for _, store, _ in combos],
    }
    return pd.DataFrame(data)

def get_filledDates(unique_dates):
    """
    return: list in which every date appears len(products) * len(stores) times in a row
    unique_dates: dates to repeat, in the order they should appear
    """
    repeats = len(products) * len(stores)
    return [day for day in unique_dates for _ in range(repeats)]

def get_filledStores(unique_dates):
    """
    return: list of store ids; each store is repeated len(products) times and
            that whole pattern is repeated once per date
    unique_dates: only its length is used
    """
    per_day = [store_id for store_id in stores for _ in range(len(products))]
    return per_day * len(unique_dates)

def copy_soldValues(source_df, destination_df):
    """
    Copy the "sold" values of source_df into the matching rows of destination_df.

    source_df: data frame with the columns date, product, store, sold; its
               index may be anything (it is not used for the lookup)
    destination_df: data frame with the same columns; modified in place
    return: destination_df

    A destination row matches when date, product and store are all equal.
    Source rows without a match are ignored; if several source rows share the
    same key, the last one wins.
    """
    # Iterate over the column values instead of looking rows up by 0..n-1
    # labels, so frames with a filtered or re-ordered index work too.
    source_rows = zip(
        source_df["date"],
        source_df["product"],
        source_df["store"],
        source_df["sold"],
    )
    for date, product, store, sold in source_rows:
        position = (
            (destination_df["date"] == date)
            & (destination_df["product"] == product)
            & (destination_df["store"] == store)
        )
        destination_df.loc[position, "sold"] = sold
    return destination_df

def main():
    """Simulate sparse sales data, fill the gaps with zeros, and print both frames."""
    observed = simulate_data()
    # Build the zero-filled grid first, then overwrite the cells that have data.
    completed = copy_soldValues(observed, get_dfMask(observed))
    print(observed)
    print(completed)


if __name__ == "__main__":
    main()

To me, your second question is difficult to answer since I do not know your expectations regarding the simulation. However, in my example I have used a slightly modified implementation.

Collectives™ on Stack Overflow

Pandas add missing dates depending on 2 variables

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related