I have a dataset with multiple ids, values, and dates (many time periods T, many ids N).
import random
import numpy as np
import polars as pl
from datetime import datetime, timedelta
n_rows = 1000  # number of synthetic observations; adjust as needed
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)
# Hoist the loop-invariant span so it isn't recomputed per row.
span_days = (end_date - start_date).days

# Random calendar dates uniformly drawn within [start_date, end_date].
dates = [start_date + timedelta(days=random.randint(0, span_days)) for _ in range(n_rows)]
# Integer ids in [0, 100] and small uniform daily "returns" in [-1%, +1%].
unique_ids = [random.randint(0, 100) for _ in range(n_rows)]
returns = [random.uniform(-0.01, 0.01) for _ in range(n_rows)]

data = pl.DataFrame({'date': dates, 'values': returns, 'ids': unique_ids})
# Snap every date to its month-end so rows line up with the month-end block dates.
data = data.with_columns(date=pl.col("date").dt.month_end())
Additionally, there are two other datasets: SB_dates, containing random dates arranged in blocks, and newBL, a boolean DataFrame of the same shape as SB_dates.
# Block dates: each column bd0..bd5 is one block of 20 month-end dates,
# stored as ISO strings and parsed to pl.Date below.
SB_dates = pl.DataFrame({
'bd0': ['2009-03-31', '2010-05-31', '2011-03-31', '2001-12-31', '2000-06-30', '2015-03-31', '2013-01-31', '2018-01-31', '2020-08-31', '2021-04-30', '2015-12-31', '2002-05-31', '2016-12-31', '2021-09-30', '2001-02-28', '2022-02-28', '2015-04-30', '2016-11-30', '2021-05-31', '2021-10-31'],
'bd1': ['2014-06-30', '2016-11-30', '2009-11-30', '2004-10-31', '2011-06-30', '2022-09-30', '2011-01-31', '2005-12-31', '2001-01-31', '2014-05-31', '2006-03-31', '2002-02-28', '2021-07-31', '2019-12-31', '2000-07-31', '2021-12-31', '2017-09-30', '2000-06-30', '2021-09-30', '2007-06-30'],
'bd2': ['2018-12-31', '2015-08-31', '2016-07-31', '2003-05-31', '2017-03-31', '2009-11-30', '2017-04-30', '2005-04-30', '2008-06-30', '2018-10-31', '2018-04-30', '2013-02-28', '2013-07-31', '2020-02-29', '2015-05-31', '2002-04-30', '2020-02-29', '2011-04-30', '2004-07-31', '2021-10-31'],
'bd3': ['2006-03-31', '2023-05-31', '2002-01-31', '2020-04-30', '2005-05-31', '2002-01-31', '2022-04-30', '2002-12-31', '2013-10-31', '2002-08-31', '2015-08-31', '2000-06-30', '2009-05-31', '2013-10-31', '2014-07-31', '2012-04-30', '2010-08-31', '2016-03-31', '2019-01-31', '2005-10-31'],
'bd4': ['2006-01-31', '2010-01-31', '2001-03-31', '2011-04-30', '2021-01-31', '2017-04-30', '2023-01-31', '2010-09-30', '2017-04-30', '2015-03-31', '2023-05-31', '2006-12-31', '2004-06-30', '2005-05-31', '2007-06-30', '2004-04-30', '2003-10-31', '2016-08-31', '2003-12-31', '2000-03-31'],
'bd5': ['2010-09-30', '2010-10-31', '2012-01-31', '2018-03-31', '2011-07-31', '2022-03-31', '2019-03-31', '2003-03-31', '2001-05-31', '2000-04-30', '2022-03-31', '2014-06-30', '2015-05-31', '2021-03-31', '2010-05-31', '2020-11-30', '2012-05-31', '2011-02-28', '2008-03-31', '2023-07-31']
}).with_columns(pl.all().str.to_date())  # parse every column from string to pl.Date
# "New block" flags, same 20x6 shape as SB_dates: True means a fresh random
# id sample is drawn on that date; False means the previous sample is reused.
newBL = pl.DataFrame({
'bn0': [True, False, False, False, True, True, True, True, True, False, True, True, True, False, True, False, False, False, False, False],
'bn1': [True, True, False, False, False, False, True, True, False, False, True, True, False, False, False, True, True, True, True, True],
'bn2': [True, False, False, False, False, False, True, False, True, True, True, False, False, False, True, True, True, False, True, True],
'bn3': [True, False, True, True, True, True, False, False, True, True, False, True, True, True, False, False, True, True, True, False],
'bn4': [True, False, True, False, False, True, False, False, True, False, False, False, True, True, True, False, False, False, False, False],
'bn5': [True, False, True, False, False, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True]
})
For each new block (where newBL is True), I need to select n=5 random ids and calculate the average of their values on that date. If it's not a new block, I should retain the same random ids that were previously selected.
I have coded the function like this, using the data, SB_dates and newBL DataFrames provided.
def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
    """For each block date, average the 'values' of n randomly sampled ids.

    Walks SB_dates column by column. Whenever the matching newBL flag is
    True (or no sample exists yet), a fresh set of up to ``n`` ids is drawn
    from the rows of ``data`` on that date; otherwise the previously drawn
    ids are reused. Returns a DataFrame of the same shape as SB_dates whose
    cell (i, col) is the mean 'values' of the selected ids on that date
    (null when no selected id has data on that date).

    Parameters
    ----------
    data : pl.DataFrame with columns 'date', 'values', 'ids'
    SB_dates : pl.DataFrame of dates (blocks as columns)
    newBL : pl.DataFrame of booleans, same shape as SB_dates
    n : int, number of ids to sample per new block (default 5)
    """
    n_rows, n_columns = SB_dates.shape
    out_columns = []
    for col in range(n_columns):
        dates_col = SB_dates[:, col]   # pl.Series of dates for this block
        flags_col = newBL[:, col]      # pl.Series of "new block" booleans
        current_ids = None             # ids carried forward between rows
        mean_values_col = []
        for i in range(n_rows):
            # Compare against the scalar date (dates_col[i]), not a 1-row
            # DataFrame slice — the slice comparison raises in polars.
            day_rows = data.filter(pl.col("date") == dates_col[i])
            # Re-sample on a new block; also guards the first iteration so a
            # leading False flag can't leave current_ids undefined (the
            # original code raised NameError in that case).
            if flags_col[i] or current_ids is None:
                # NOTE: seed=1 is kept from the original, so every draw uses
                # the same shuffle order — confirm this determinism is wanted.
                current_ids = (
                    day_rows.select(pl.col("ids").shuffle(seed=1))
                    .limit(n)["ids"]
                )
            selected = day_rows.filter(pl.col("ids").is_in(current_ids))
            mean_values_col.append(selected["values"].mean())
        # Force Float64 so an all-null column doesn't come out as Null dtype.
        out_columns.append(pl.Series(str(col), mean_values_col, dtype=pl.Float64))
    return pl.DataFrame(out_columns)
# Run over all 6 blocks, sampling 5 ids per new block; r is a 20x6 DataFrame.
r = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
shape: (20, 6)
┌───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ 0 ┆ 1 ┆ 2 ┆ 3 ┆ 4 ┆ 5 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.00242 ┆ 0.001441 ┆ 0.00196 ┆ 0.003028 ┆ -0.001259 ┆ 0.000704 │
│ null ┆ 0.004338 ┆ null ┆ null ┆ null ┆ null │
│ null ┆ null ┆ null ┆ -0.000406 ┆ 0.000412 ┆ -0.000115 │
│ null ┆ null ┆ null ┆ -0.002489 ┆ null ┆ null │
│ -0.001194 ┆ null ┆ null ┆ -0.002647 ┆ null ┆ null │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ null ┆ -0.000378 ┆ -0.000107 ┆ null ┆ null ┆ null │
│ null ┆ -0.001728 ┆ -0.001958 ┆ 0.000582 ┆ null ┆ -0.000583 │
│ null ┆ -0.001194 ┆ null ┆ 0.000622 ┆ null ┆ -0.002279 │
│ null ┆ 0.003631 ┆ 0.002787 ┆ -0.003758 ┆ null ┆ -0.002329 │
│ null ┆ 0.00516 ┆ 0.002487 ┆ null ┆ null ┆ -0.002492 │
└───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
This works, but the problem is that I have two nested for-loops, with a condition inside the inner one that depends on another DataFrame (newBL).
How can I perform this in a more efficient manner?
I had to add `.item()` to `date_column[i,:]` to get rid of a `ValueError`, but then `selected_stocks_df` isn't defined. Can you add an example of `selected_stocks_df` to make the code runnable? (Along with the code for `SB_dates` and `newBL`, as opposed to their repr.)