
I have a dataset with multiple ids, values, and dates (large T, large N).

import random
import numpy as np
import polars as pl
from datetime import datetime, timedelta
     
n_rows = 1000  # You can adjust this as needed
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)

dates = [
    start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
    for _ in range(n_rows)
]

unique_ids = [random.randint(0, 100) for _ in range(n_rows)]
returns = [random.uniform(-0.01, 0.01) for _ in range(n_rows)]

data = pl.DataFrame({'date': dates, 'values': returns, 'ids': unique_ids})
data = data.with_columns(date=pl.col("date").dt.month_end())

Additionally, there are two other DataFrames: SB_dates, with random dates (arranged in blocks), and newBL, a boolean DataFrame of the same shape as SB_dates.

SB_dates = pl.DataFrame({
    'bd0': ['2009-03-31', '2010-05-31', '2011-03-31', '2001-12-31', '2000-06-30', '2015-03-31', '2013-01-31', '2018-01-31', '2020-08-31', '2021-04-30', '2015-12-31', '2002-05-31', '2016-12-31', '2021-09-30', '2001-02-28', '2022-02-28', '2015-04-30', '2016-11-30', '2021-05-31', '2021-10-31'],
    'bd1': ['2014-06-30', '2016-11-30', '2009-11-30', '2004-10-31', '2011-06-30', '2022-09-30', '2011-01-31', '2005-12-31', '2001-01-31', '2014-05-31', '2006-03-31', '2002-02-28', '2021-07-31', '2019-12-31', '2000-07-31', '2021-12-31', '2017-09-30', '2000-06-30', '2021-09-30', '2007-06-30'],
    'bd2': ['2018-12-31', '2015-08-31', '2016-07-31', '2003-05-31', '2017-03-31', '2009-11-30', '2017-04-30', '2005-04-30', '2008-06-30', '2018-10-31', '2018-04-30', '2013-02-28', '2013-07-31', '2020-02-29', '2015-05-31', '2002-04-30', '2020-02-29', '2011-04-30', '2004-07-31', '2021-10-31'],
    'bd3': ['2006-03-31', '2023-05-31', '2002-01-31', '2020-04-30', '2005-05-31', '2002-01-31', '2022-04-30', '2002-12-31', '2013-10-31', '2002-08-31', '2015-08-31', '2000-06-30', '2009-05-31', '2013-10-31', '2014-07-31', '2012-04-30', '2010-08-31', '2016-03-31', '2019-01-31', '2005-10-31'],
    'bd4': ['2006-01-31', '2010-01-31', '2001-03-31', '2011-04-30', '2021-01-31', '2017-04-30', '2023-01-31', '2010-09-30', '2017-04-30', '2015-03-31', '2023-05-31', '2006-12-31', '2004-06-30', '2005-05-31', '2007-06-30', '2004-04-30', '2003-10-31', '2016-08-31', '2003-12-31', '2000-03-31'],
    'bd5': ['2010-09-30', '2010-10-31', '2012-01-31', '2018-03-31', '2011-07-31', '2022-03-31', '2019-03-31', '2003-03-31', '2001-05-31', '2000-04-30', '2022-03-31', '2014-06-30', '2015-05-31', '2021-03-31', '2010-05-31', '2020-11-30', '2012-05-31', '2011-02-28', '2008-03-31', '2023-07-31']
}).with_columns(pl.all().str.to_date())

newBL = pl.DataFrame({
    'bn0': [True, False, False, False, True, True, True, True, True, False, True, True, True, False, True, False, False, False, False, False],
    'bn1': [True, True, False, False, False, False, True, True, False, False, True, True, False, False, False, True, True, True, True, True],
    'bn2': [True, False, False, False, False, False, True, False, True, True, True, False, False, False, True, True, True, False, True, True],
    'bn3': [True, False, True, True, True, True, False, False, True, True, False, True, True, True, False, False, True, True, True, False],
    'bn4': [True, False, True, False, False, True, False, False, True, False, False, False, True, True, True, False, False, False, False, False],
    'bn5': [True, False, True, False, False, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True]
})

For each new block, I need to select n (here 5) random ids and calculate the average of their values on that date. If it's not a new block, I should retain the same random ids that were previously selected.

I have coded the function like this, using the data, SB_dates and newBL DataFrames provided.

def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
    n_rows, n_columns = SB_dates.shape
    df_sb = pl.DataFrame()

    for col in range(n_columns):
        date_column = SB_dates[:, col]
        newBL_column = newBL[:, col]
        mean_values_col = []

        for i in range(n_rows):
            # all rows of data on this date (data["date"] is a datetime, SB_dates holds dates)
            filter_ids = data.filter(pl.col("date").dt.date() == date_column[i])
            if newBL_column[i]:
                # new block: draw n random ids; they are reused until the next new block
                random_ids = filter_ids.select(pl.col("ids").shuffle(seed=1)).limit(n)

            selected_ids_df = filter_ids.filter(pl.col("ids").is_in(random_ids["ids"]))
            mean_values_col.append(selected_ids_df["values"].mean())

        df_sb = df_sb.hstack([pl.Series(str(col), mean_values_col)])
    return df_sb


r = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
shape: (20, 6)
┌───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ 0         ┆ 1         ┆ 2         ┆ 3         ┆ 4         ┆ 5         │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.00242   ┆ 0.001441  ┆ 0.00196   ┆ 0.003028  ┆ -0.001259 ┆ 0.000704  │
│ null      ┆ 0.004338  ┆ null      ┆ null      ┆ null      ┆ null      │
│ null      ┆ null      ┆ null      ┆ -0.000406 ┆ 0.000412  ┆ -0.000115 │
│ null      ┆ null      ┆ null      ┆ -0.002489 ┆ null      ┆ null      │
│ -0.001194 ┆ null      ┆ null      ┆ -0.002647 ┆ null      ┆ null      │
│ …         ┆ …         ┆ …         ┆ …         ┆ …         ┆ …         │
│ null      ┆ -0.000378 ┆ -0.000107 ┆ null      ┆ null      ┆ null      │
│ null      ┆ -0.001728 ┆ -0.001958 ┆ 0.000582  ┆ null      ┆ -0.000583 │
│ null      ┆ -0.001194 ┆ null      ┆ 0.000622  ┆ null      ┆ -0.002279 │
│ null      ┆ 0.003631  ┆ 0.002787  ┆ -0.003758 ┆ null      ┆ -0.002329 │
│ null      ┆ 0.00516   ┆ 0.002487  ┆ null      ┆ null      ┆ -0.002492 │
└───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘

This works, but the problem is that I have two nested for-loops, with a condition in the inner one (conditional on another DataFrame).

How can I perform this in a more efficient manner?

  • I had to add .item() to date_column[i,:] to get rid of a ValueError but then selected_stocks_df isn't defined. Can you add an example of selected_stocks_df to make the code runnable? (Along with the code for SB_dates and newBL as opposed to their repr.) Commented Sep 29, 2023 at 16:35
  • Sorry for that! I've edited the question with the complete DGP and implementable code. The function 'SB' is a modified Stationary Bootstrap, which not only resamples data but also identifies the start of each block as 'newBL'. For more details: arch.readthedocs.io/en/latest/bootstrap/… Commented Sep 29, 2023 at 17:36

2 Answers

1
SB_dates = (
   SB_dates
    .with_row_index("row")
    .unpivot(index="row", variable_name="col", value_name="date")
    .with_columns(pl.col("col").rle_id())
)

df_mask = pl.concat(
   [SB_dates, newBL.unpivot().select(is_block = "value")], 
   how = "horizontal"
)

df_mask = df_mask.with_columns(
   pl.when(pl.col.is_block)
     .then(pl.col.date)
     .forward_fill()
     .alias("date_block")
)

all_ids = data.group_by(pl.col.date.dt.date()).agg("ids")

random_ids = (
   data
    .group_by(pl.col.date.dt.date())
    .agg(random_ids = pl.col.ids.shuffle().head(5))
)

df_selections = (
   df_mask
    .join(all_ids, on="date")
    .join(random_ids, left_on="date_block", right_on="date")
    .with_columns(pl.col.ids.list.set_intersection(pl.col.random_ids))
    .select("row", "col", "date", "ids")
    .explode("ids")
)

result = (
   df_selections
    .join(data, on=[pl.col.date.dt.date(), "ids"])
    .sort("row", "col")
    .pivot(
       on = "col", 
       index = "row", 
       values = "values", 
       aggregate_function = pl.element().mean()
    )
)

Reshaping

Generally, if you can reshape your data from "wide" to "long", i.e. turn "columns" into "rows", that is where Polars will be most efficient.

.unpivot()

SB_dates = (
   SB_dates
    .with_row_index("row")
    .unpivot(index="row", variable_name="col", value_name="date")
    .with_columns(pl.col("col").rle_id())
)
shape: (120, 3)
┌─────┬─────┬────────────┐
│ row ┆ col ┆ date       │
│ --- ┆ --- ┆ ---        │
│ u32 ┆ u32 ┆ date       │
╞═════╪═════╪════════════╡
│ 0   ┆ 0   ┆ 2009-03-31 │
│ 1   ┆ 0   ┆ 2010-05-31 │
│ 2   ┆ 0   ┆ 2011-03-31 │
│ 3   ┆ 0   ┆ 2001-12-31 │
│ 4   ┆ 0   ┆ 2000-06-30 │
│ …   ┆ …   ┆ …          │
│ 15  ┆ 5   ┆ 2020-11-30 │
│ 16  ┆ 5   ┆ 2012-05-31 │
│ 17  ┆ 5   ┆ 2011-02-28 │
│ 18  ┆ 5   ┆ 2008-03-31 │
│ 19  ┆ 5   ┆ 2023-07-31 │
└─────┴─────┴────────────┘
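
For reference, .rle_id() assigns a 0-based id to each run of equal values, which is what turns the bd0…bd5 column names into the integer col indices above. A minimal standalone sketch:

import polars as pl

s = pl.Series("col", ["bd0", "bd0", "bd1", "bd1", "bd2"])
s.rle_id()  # 0, 0, 1, 1, 2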

We also reshape newBL and horizontally concat to pair up the corresponding boolean values.

This is then used to mask out and .forward_fill() the date column, so every row carries the date of the most recent new block (date_block).
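
As a minimal standalone sketch of that mask-and-fill pattern (toy data, made-up names), before applying it to the real columns below:

import polars as pl

# keep the value only where the flag is set, then carry it forward
toy = pl.DataFrame({"is_block": [True, False, False, True, False], "x": [1, 2, 3, 4, 5]})
toy.with_columns(x_block=pl.when(pl.col("is_block")).then(pl.col("x")).forward_fill())
# x_block: 1, 1, 1, 4, 4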

df_mask = pl.concat([SB_dates, newBL.unpivot().select(is_block = "value")], how="horizontal")

df_mask = df_mask.with_columns(
   pl.when(pl.col.is_block)
     .then(pl.col.date)
     .forward_fill()
     .alias("date_block")
)
shape: (120, 5)
┌─────┬─────┬────────────┬──────────┬────────────┐
│ row ┆ col ┆ date       ┆ is_block ┆ date_block │
│ --- ┆ --- ┆ ---        ┆ ---      ┆ ---        │
│ u32 ┆ u32 ┆ date       ┆ bool     ┆ date       │
╞═════╪═════╪════════════╪══════════╪════════════╡
│ 0   ┆ 0   ┆ 2009-03-31 ┆ true     ┆ 2009-03-31 │ # <-
│ 1   ┆ 0   ┆ 2010-05-31 ┆ false    ┆ 2009-03-31 │ # <-
│ 2   ┆ 0   ┆ 2011-03-31 ┆ false    ┆ 2009-03-31 │ # <-
│ 3   ┆ 0   ┆ 2001-12-31 ┆ false    ┆ 2009-03-31 │ # <-
│ 4   ┆ 0   ┆ 2000-06-30 ┆ true     ┆ 2000-06-30 │
│ …   ┆ …   ┆ …          ┆ …        ┆ …          │
│ 15  ┆ 5   ┆ 2020-11-30 ┆ false    ┆ 2010-05-31 │
│ 16  ┆ 5   ┆ 2012-05-31 ┆ true     ┆ 2012-05-31 │
│ 17  ┆ 5   ┆ 2011-02-28 ┆ true     ┆ 2011-02-28 │
│ 18  ┆ 5   ┆ 2008-03-31 ┆ true     ┆ 2008-03-31 │
│ 19  ┆ 5   ┆ 2023-07-31 ┆ true     ┆ 2023-07-31 │
└─────┴─────┴────────────┴──────────┴────────────┘

By using date_block to search for ids, we will emulate the desired behaviour:

If it's not a new block, I should retain the same random ids that were previously selected.

List is_in() other_list

If we reshape data so that we have lists of ids and random_ids per date:

all_ids = data.group_by(pl.col.date.dt.date()).agg("ids")

random_ids = (
   data
    .group_by(pl.col.date.dt.date())
    .agg(random_ids = pl.col.ids.shuffle().head(5))
)

.list.set_intersection() can be used to do the same as the multiple .filter + .is_in checks you're using.
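
For reference, a minimal standalone sketch of .list.set_intersection() on toy data (column names made up):

import polars as pl

toy = pl.DataFrame({
    "ids": [[1, 2, 3], [4, 5]],
    "random_ids": [[2, 3, 9], [6]],
})
toy.with_columns(selection=pl.col("ids").list.set_intersection(pl.col("random_ids")))
# selection: [[2, 3], []]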

(
   df_mask
    .join(all_ids, on="date")
    .join(random_ids, left_on="date_block", right_on="date")
    .with_columns(selection = pl.col.ids.list.set_intersection(pl.col.random_ids))
)
shape: (114, 8)
┌─────┬─────┬────────────┬──────────┬────────────┬────────────────┬────────────────┬────────────────┐
│ row ┆ col ┆ date       ┆ is_block ┆ date_block ┆ ids            ┆ random_ids     ┆ selection      │
│ --- ┆ --- ┆ ---        ┆ ---      ┆ ---        ┆ ---            ┆ ---            ┆ ---            │
│ u32 ┆ u32 ┆ date       ┆ bool     ┆ date       ┆ list[i64]      ┆ list[i64]      ┆ list[i64]      │
╞═════╪═════╪════════════╪══════════╪════════════╪════════════════╪════════════════╪════════════════╡
│ 19  ┆ 3   ┆ 2005-10-31 ┆ false    ┆ 2019-01-31 ┆ [73, 38]       ┆ [80]           ┆ []             │
│ 10  ┆ 0   ┆ 2015-12-31 ┆ true     ┆ 2015-12-31 ┆ [97, 57, … 12] ┆ [57, 40, … 81] ┆ [97, 57, … 12] │
│ 4   ┆ 5   ┆ 2011-07-31 ┆ false    ┆ 2012-01-31 ┆ [98, 40, 31]   ┆ [48, 6]        ┆ []             │
│ 2   ┆ 5   ┆ 2012-01-31 ┆ true     ┆ 2012-01-31 ┆ [48, 6]        ┆ [48, 6]        ┆ [48, 6]        │
│ 16  ┆ 3   ┆ 2010-08-31 ┆ true     ┆ 2010-08-31 ┆ [35, 15, … 54] ┆ [15, 19, … 35] ┆ [35, 15, … 54] │
│ …   ┆ …   ┆ …          ┆ …        ┆ …          ┆ …              ┆ …              ┆ …              │
│ 5   ┆ 2   ┆ 2009-11-30 ┆ false    ┆ 2018-12-31 ┆ [92, 52, … 58] ┆ [50, 33, … 97] ┆ []             │
│ 6   ┆ 5   ┆ 2019-03-31 ┆ true     ┆ 2019-03-31 ┆ [88, 12, … 2]  ┆ [35, 2, … 88]  ┆ [88, 12, … 2]  │
│ 9   ┆ 0   ┆ 2021-04-30 ┆ false    ┆ 2020-08-31 ┆ [19]           ┆ [74, 34, … 87] ┆ []             │
│ 16  ┆ 1   ┆ 2017-09-30 ┆ true     ┆ 2017-09-30 ┆ [83, 93, … 13] ┆ [93, 66, … 59] ┆ [83, 93, … 13] │
│ 1   ┆ 5   ┆ 2010-10-31 ┆ false    ┆ 2010-09-30 ┆ [90, 13]       ┆ [75, 45]       ┆ []             │
└─────┴─────┴────────────┴──────────┴────────────┴────────────────┴────────────────┴────────────────┘

From here, we keep only the needed columns and .explode() the list back into rows.

df_selections = (
   df_mask
    .join(all_ids, on="date")
    .join(random_ids, left_on="date_block", right_on="date")
    .with_columns(pl.col.ids.list.set_intersection(pl.col.random_ids))
    .select("row", "col", "date", "ids")
    .explode("ids")
)
shape: (258, 4)
┌─────┬─────┬────────────┬──────┐
│ row ┆ col ┆ date       ┆ ids  │
│ --- ┆ --- ┆ ---        ┆ ---  │
│ u32 ┆ u32 ┆ date       ┆ i64  │
╞═════╪═════╪════════════╪══════╡
│ 0   ┆ 4   ┆ 2006-01-31 ┆ 60   │
│ 0   ┆ 4   ┆ 2006-01-31 ┆ 7    │
│ 0   ┆ 4   ┆ 2006-01-31 ┆ 70   │
│ 0   ┆ 4   ┆ 2006-01-31 ┆ 62   │
│ 0   ┆ 4   ┆ 2006-01-31 ┆ 82   │
│ …   ┆ …   ┆ …          ┆ …    │
│ 9   ┆ 4   ┆ 2015-03-31 ┆ null │
│ 8   ┆ 2   ┆ 2008-06-30 ┆ 70   │
│ 8   ┆ 2   ┆ 2008-06-30 ┆ 12   │
│ 19  ┆ 5   ┆ 2023-07-31 ┆ 12   │
│ 19  ┆ 5   ┆ 2023-07-31 ┆ 67   │
└─────┴─────┴────────────┴──────┘

Pivot

With only the wanted ids remaining, a .join() can be used to get the corresponding values.

(df_selections
  .join(data, on=[pl.col.date.dt.date(), "ids"])
  .sort("row", "col")
)
shape: (261, 9)
┌─────┬─────┬────────────┬──────────┬───┬──────┬─────────────────────┬───────────┬───────────┐
│ row ┆ col ┆ date       ┆ is_block ┆ … ┆ ids  ┆ date_right          ┆ values    ┆ ids_right │
│ --- ┆ --- ┆ ---        ┆ ---      ┆   ┆ ---  ┆ ---                 ┆ ---       ┆ ---       │
│ u32 ┆ u32 ┆ date       ┆ bool     ┆   ┆ i64  ┆ datetime[μs]        ┆ f64       ┆ i64       │
╞═════╪═════╪════════════╪══════════╪═══╪══════╪═════════════════════╪═══════════╪═══════════╡
│ 0   ┆ 0   ┆ 2009-03-31 ┆ true     ┆ … ┆ 24   ┆ 2009-03-31 00:00:00 ┆ -0.006142 ┆ 24        │
│ 0   ┆ 1   ┆ 2014-06-30 ┆ true     ┆ … ┆ 27   ┆ 2014-06-30 00:00:00 ┆ -0.001445 ┆ 27        │
│ 0   ┆ 1   ┆ 2014-06-30 ┆ true     ┆ … ┆ 57   ┆ 2014-06-30 00:00:00 ┆ 0.005532  ┆ 57        │
│ 0   ┆ 1   ┆ 2014-06-30 ┆ true     ┆ … ┆ 96   ┆ 2014-06-30 00:00:00 ┆ -0.00793  ┆ 96        │
│ 0   ┆ 1   ┆ 2014-06-30 ┆ true     ┆ … ┆ 14   ┆ 2014-06-30 00:00:00 ┆ 0.003196  ┆ 14        │
│ …   ┆ …   ┆ …          ┆ …        ┆ … ┆ …    ┆ …                   ┆ …         ┆ …         │
│ 19  ┆ 2   ┆ 2021-10-31 ┆ true     ┆ … ┆ 15   ┆ 2021-10-31 00:00:00 ┆ 0.003261  ┆ 15        │
│ 19  ┆ 3   ┆ 2005-10-31 ┆ false    ┆ … ┆ null ┆ null                ┆ null      ┆ null      │
│ 19  ┆ 4   ┆ 2000-03-31 ┆ false    ┆ … ┆ null ┆ null                ┆ null      ┆ null      │
│ 19  ┆ 5   ┆ 2023-07-31 ┆ true     ┆ … ┆ 12   ┆ 2023-07-31 00:00:00 ┆ -0.000579 ┆ 12        │
│ 19  ┆ 5   ┆ 2023-07-31 ┆ true     ┆ … ┆ 67   ┆ 2023-07-31 00:00:00 ┆ -0.009836 ┆ 67        │
└─────┴─────┴────────────┴──────────┴───┴──────┴─────────────────────┴───────────┴───────────┘

.pivot() can then be used to return back to the original "wide" shape and aggregate the mean.

(df_selections
  .join(data, on=[pl.col.date.dt.date(), "ids"])
  .sort("row", "col")
  .pivot(on="col", index="row", values="values", aggregate_function=pl.element().mean())
)
shape: (20, 7)
┌─────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ row ┆ 0         ┆ 1         ┆ 2         ┆ 3         ┆ 4         ┆ 5         │
│ --- ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ u32 ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0   ┆ -0.006142 ┆ -0.000162 ┆ 0.001333  ┆ 0.000646  ┆ 0.004068  ┆ -0.007742 │
│ 1   ┆ null      ┆ -0.000778 ┆ null      ┆ null      ┆ null      ┆ null      │
│ 2   ┆ null      ┆ null      ┆ null      ┆ null      ┆ -0.001934 ┆ -0.004242 │
│ 3   ┆ null      ┆ null      ┆ -0.002209 ┆ -0.005019 ┆ null      ┆ 0.004758  │
│ 4   ┆ null      ┆ null      ┆ -0.000181 ┆ 0.001981  ┆ null      ┆ null      │
│ …   ┆ …         ┆ …         ┆ …         ┆ …         ┆ …         ┆ …         │
│ 15  ┆ null      ┆ -0.00059  ┆ 0.003549  ┆ null      ┆ null      ┆ null      │
│ 16  ┆ null      ┆ -0.003033 ┆ -0.003727 ┆ -0.002882 ┆ null      ┆ 0.009226  │
│ 17  ┆ null      ┆ null      ┆ null      ┆ -0.003837 ┆ null      ┆ 0.009743  │
│ 18  ┆ null      ┆ -0.001557 ┆ 0.000865  ┆ -0.008088 ┆ null      ┆ 0.001735  │
│ 19  ┆ 0.008602  ┆ 0.000155  ┆ -0.001692 ┆ null      ┆ null      ┆ -0.005207 │
└─────┴───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘


1
  • Unpivoting the two static DataFrames, SB_dates and newBL, gives you all the data in column1, column2, ... order, which is exactly what is needed here to eliminate the first loop. (Even if it didn't, a sort could get it into this order.) Each element is uniquely identified by index and the variable column (renamed to col) in the reshaped DataFrame.
  • A left join (which maintains the order of the left DataFrame) of this static DataFrame with the main data does the whole filter_ids = ... step in one go.
  • A group_by then readies the "choose n random ids on that date" step.
  • With maintain_order=True, we also keep the original order from the left join, so each element of the final returned DataFrame correctly corresponds to each element of SB_dates and newBL.
  • A pl.when can be used inside the aggregation to null out any unneeded calculations.
  • Note also that we only need the values after the aggregation; the ids were only needed to pair with the corresponding values in the sample. From here on we discard everything but the values.
  • Unnesting the values list with list.to_struct and then unnest avoids lists of nulls / structs of nulls: when newBL is False you just get a null per new column (see the short sketch after the function below).
  • Each mean is then calculated, and fill_null with a forward strategy is applied at this point.

That gets you the means in a one-column DataFrame (mean_val), which can optionally be reshaped back into the original SB_dates shape if desired.

def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
    # reshape both static frames to long format: one row per (index, col) pair
    static_data = (
        SB_dates.with_row_index()
        .unpivot(index='index', variable_name='col', value_name='date')
        .with_columns(newBL.unpivot().select(bl='value'))
    )

    return (
        # left join keeps static_data's order and attaches all rows of data on each date
        static_data.join(data, on=pl.col('date').dt.date(), how='left')
        .group_by('index', 'col', maintain_order=True)
        # sample n random ids/values only where newBL (bl) is True, otherwise null
        .agg(pl.when(pl.col('bl')).then(pl.col('ids', 'values').sample(n)))
        .select(
            # turn the list of sampled values into val0..val{n-1} columns
            pl.col('values').list.to_struct(
                n_field_strategy='max_width',
                fields=[f'val{j}' for j in range(n)],
                upper_bound=n,
            ),
        )
        .unnest('values')
        .select(
            # horizontal mean per row, with nulls filled forward
            mean_val=(pl.sum_horizontal(pl.all()) / n)
            .fill_null(strategy='forward')
        # steps from this point are optional
            .reshape(SB_dates.shape)
            .arr.to_struct()
        )
        .unnest('mean_val')
    )
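
For reference, here is the list.to_struct + unnest step in isolation on toy data (names made up): a null list row becomes a plain null in each new column.

import polars as pl

toy = pl.DataFrame({"values": [[0.1, 0.2], None]})
(
    toy.select(
        pl.col("values").list.to_struct(
            n_field_strategy="max_width", fields=["val0", "val1"], upper_bound=2
        )
    )
    .unnest("values")
)
# val0, val1: 0.1, 0.2 in the first row; null, null in the second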

5 Comments

My initial thought was perhaps melt then the left join would be a starting point to do it without loops: sb_data.with_row_count().melt('row_nr', variable_name='col', value_name='date').with_columns(blnew_data.melt().select(bl = 'value')) - Haven't investigated further yet so not sure if it complicates things or not.
@Wayoshi This new approach seems to do the job much faster. Thank you for all the details. However, I can't retrieve the columns when newBL is set to False after using pl.agg(pl.when. PS. I just modified the selectors part to prevent repeated ones when the number of columns exceeds 10. Thank you again!
Yeah, I took some liberties on defining my versions of SB_dates and newBL (started writing the answer when you were editing the original post), so whatever tweaks. I think the main points are how the join (or melt as jqurious suggests), only filling nulls at the end, etc., really cuts the work down.
Indeed, an extra join and a melt worked. Thanks to both.
Great, I edited the answer just for completeness at this point.
