As the original poster, I have revised the code based on the many answers as follows:
Add a docstring
Use the magic doctest for unit testing
Removed the numpy parameter (a separate function would probably be better)
Separate the if-else expression into an if-else block
Renamed the numpy variable to use_numpy for clarity
Use slicing to extract the part of co_varnames that corresponds to the argument names. The docs seem to imply that this works:
co_varnames
Returns a tuple containing the names of the local variables (starting with the argument names).
Using inspect.signature instead of co_varnames causes a performance hit, so I reverted to using co_varnames.
import pandas as pd
from typing import Callable
def pda(df: pd.DataFrame, f: Callable, use_numpy: bool = True):
    """Call `f` with the columns of `df` named after the arguments of `f`.

    The positional parameter names of `f` are read from its code object,
    each name is looked up as a column of `df`, and the columns are passed
    to `f` positionally, in parameter order.

    Args:
        df (pd.DataFrame): input DataFrame
        f (Callable): function to be performed; it must expose `__code__`
            (a plain function or lambda does)
        use_numpy (bool, optional, defaults to True): pass each column as
            its `.values` array instead of as a Series

    Returns:
        Whatever `f` returns for the selected columns.

    Raises:
        AttributeError: if `f` has no `__code__` attribute.
        KeyError: if a parameter name of `f` is not a column of `df`.

    Example:
        >>> df = pd.DataFrame({
        ...     "d": [1, 2, 3, 4],
        ...     "a": [2, 3, 4, 5],
        ...     "c": [3, 4, 5, 6],
        ...     "b": [4, 5, 6, 7]
        ... })
        >>> df["e"] = pda(df, lambda c, a: c - a)
        >>> print(df)
           d  a  c  b  e
        0  1  2  3  4  1
        1  2  3  4  5  1
        2  3  4  5  6  1
        3  4  5  6  7  1
    """
    code = f.__code__
    # co_varnames lists the argument names first, followed by the other
    # locals; the first co_argcount entries are the positional parameters.
    arg_names = code.co_varnames[:code.co_argcount]
    if use_numpy:
        return f(*(df[name].values for name in arg_names))
    return f(*(df[name] for name in arg_names))
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    import doctest

    doctest.testmod()
I also did some timing comparisons between the methods.
#!/usr/bin/env python3
import inspect
import random
from collections import defaultdict
from typing import Callable
import numpy as np
import pandas as pd
def main():
    """Run the module doctests, then time every benchmark case.

    A DataFrame with four columns of random floats is built once and shared
    by all cases. In each round the list of cases is shuffled and every case
    is timed for a single call; the elapsed times are summed per case and
    printed as ``name total_seconds`` at the end.
    """
    import doctest
    import timeit

    doctest.testmod()

    n_rows = 100000
    n_rounds = 1000

    df = pd.DataFrame({
        "d": np.random.random(n_rows),
        "a": np.random.random(n_rows),
        "c": np.random.random(n_rows),
        "b": np.random.random(n_rows),
    })
    tests = [
        test_pda, test_pda_series, test_pda2, test_lambda, test_eval,
        test_index, test_dot, test_assign,
    ]
    timings = defaultdict(float)
    for _ in range(n_rounds):
        # Shuffle every round so that no case always runs in the same slot.
        random.shuffle(tests)
        for test in tests:
            timings[test.__name__] += timeit.timeit(
                "test(df)",
                number=1,
                globals={"test": test, "df": df},
            )
    for test_name, timing in timings.items():
        print(test_name, timing)
def test_pda(df: pd.DataFrame) -> pd.DataFrame:
    """Benchmark case: compute ``c - a`` through `pda` with its defaults.

    The result is written to column ``"e"`` of `df` in place and `df` is
    returned.

    >>> df = pd.DataFrame({
    ...     "d": [1, 2, 3, 4],
    ...     "a": [2, 3, 4, 5],
    ...     "c": [3, 4, 5, 6],
    ...     "b": [4, 5, 6, 7]
    ... })
    >>> df = test_pda(df)
    >>> print(df)
       d  a  c  b  e
    0  1  2  3  4  1
    1  2  3  4  5  1
    2  3  4  5  6  1
    3  4  5  6  7  1
    """
    df["e"] = pda(df, lambda c, a: c - a)
    return df
def test_pda_series(df: pd.DataFrame) -> pd.DataFrame:
    """Benchmark case: compute ``c - a`` through `pda` with ``use_numpy=False``.

    The result is written to column ``"e"`` of `df` in place and `df` is
    returned.

    >>> df = pd.DataFrame({
    ...     "d": [1, 2, 3, 4],
    ...     "a": [2, 3, 4, 5],
    ...     "c": [3, 4, 5, 6],
    ...     "b": [4, 5, 6, 7]
    ... })
    >>> df = test_pda_series(df)
    >>> print(df)
       d  a  c  b  e
    0  1  2  3  4  1
    1  2  3  4  5  1
    2  3  4  5  6  1
    3  4  5  6  7  1
    """
    # Keyword form makes the meaning of the flag visible at the call site.
    df["e"] = pda(df, lambda c, a: c - a, use_numpy=False)
    return df
def test_pda2(df: pd.DataFrame) -> pd.DataFrame:
    """Benchmark case: compute ``c - a`` through `pda2` with its defaults.

    The result is written to column ``"e"`` of `df` in place and `df` is
    returned.

    >>> df = pd.DataFrame({
    ...     "d": [1, 2, 3, 4],
    ...     "a": [2, 3, 4, 5],
    ...     "c": [3, 4, 5, 6],
    ...     "b": [4, 5, 6, 7]
    ... })
    >>> df = test_pda2(df)
    >>> print(df)
       d  a  c  b  e
    0  1  2  3  4  1
    1  2  3  4  5  1
    2  3  4  5  6  1
    3  4  5  6  7  1
    """
    df["e"] = pda2(df, lambda c, a: c - a)
    return df
def test_lambda(df):
"""
```
>>> df = pd.DataFrame({
... "d": [1, 2, 3, 4],
... "a": [2, 3, 4, 5],
... "c": [3, 4, 5, 6],
... "b": [4, 5, 6, 7]
... })
>>> df = test_pda(df)
>>> print(df)
d a c b e
0 1 2 3 4 1
1 2 3 4 5 1
2 3 4 5 6 1
3 4 5 6 7 1
```
"""
df["e"] = (lambda x: x["c"].values - x["a"].values)(df)
return df
def test_eval(df):
"""
```
>>> df = pd.DataFrame({
... "d": [1, 2, 3, 4],
... "a": [2, 3, 4, 5],
... "c": [3, 4, 5, 6],
... "b": [4, 5, 6, 7]
... })
>>> df = test_eval(df)
>>> print(df)
d a c b e
0 1 2 3 4 1
1 2 3 4 5 1
2 3 4 5 6 1
3 4 5 6 7 1
```
"""
df["e"] = df.eval("c - a")
return df
def test_index(df):
"""
```
>>> df = pd.DataFrame({
... "d": [1, 2, 3, 4],
... "a": [2, 3, 4, 5],
... "c": [3, 4, 5, 6],
... "b": [4, 5, 6, 7]
... })
>>> df = test_index(df)
>>> print(df)
d a c b e
0 1 2 3 4 1
1 2 3 4 5 1
2 3 4 5 6 1
3 4 5 6 7 1
```
"""
df["e"] = df["c"].values - df["a"].values
return df
def test_dot(df):
"""
```
>>> df = pd.DataFrame({
... "d": [1, 2, 3, 4],
... "a": [2, 3, 4, 5],
... "c": [3, 4, 5, 6],
... "b": [4, 5, 6, 7]
... })
>>> df = test_dot(df)
>>> print(df)
d a c b e
0 1 2 3 4 1
1 2 3 4 5 1
2 3 4 5 6 1
3 4 5 6 7 1
```
"""
df["e"] = df.c.values - df.a.values
return df
def test_assign(df: pd.DataFrame):
"""
```
>>> df = pd.DataFrame({
... "d": [1, 2, 3, 4],
... "a": [2, 3, 4, 5],
... "c": [3, 4, 5, 6],
... "b": [4, 5, 6, 7]
... })
>>> df = test_assign(df)
>>> print(df)
d a c b e
0 1 2 3 4 1
1 2 3 4 5 1
2 3 4 5 6 1
3 4 5 6 7 1
```
"""
return df.assign(e=lambda x: x["c"].values - x["a"].values)
def pda(df: pd.DataFrame, f: Callable, use_numpy: bool = True):
    """Call `f` with the columns of `df` named after the arguments of `f`.

    The positional parameter names of `f` are read from its code object,
    each name is looked up as a column of `df`, and the columns are passed
    to `f` positionally, in parameter order.

    Args:
        df (pd.DataFrame): input DataFrame
        f (Callable): function to be performed; it must expose `__code__`
            (a plain function or lambda does)
        use_numpy (bool, optional, defaults to True): pass each column as
            its `.values` array instead of as a Series

    Returns:
        Whatever `f` returns for the selected columns.

    Raises:
        AttributeError: if `f` has no `__code__` attribute.
        KeyError: if a parameter name of `f` is not a column of `df`.

    Example:
        >>> df = pd.DataFrame({
        ...     "d": [1, 2, 3, 4],
        ...     "a": [2, 3, 4, 5],
        ...     "c": [3, 4, 5, 6],
        ...     "b": [4, 5, 6, 7]
        ... })
        >>> df["e"] = pda(df, lambda c, a: c - a)
        >>> print(df)
           d  a  c  b  e
        0  1  2  3  4  1
        1  2  3  4  5  1
        2  3  4  5  6  1
        3  4  5  6  7  1
    """
    code = f.__code__
    # co_varnames lists the argument names first, followed by the other
    # locals; the first co_argcount entries are the positional parameters.
    arg_names = code.co_varnames[:code.co_argcount]
    if use_numpy:
        return f(*(df[name].values for name in arg_names))
    return f(*(df[name] for name in arg_names))
def pda2(df: pd.DataFrame, f: Callable, use_numpy: bool = True):
    """Variant of `pda` that discovers the argument names via `inspect`.

    Every parameter name in the signature of `f` is looked up as a column
    of `df`, and the columns are passed to `f` positionally, in signature
    order.

    Args:
        df (pd.DataFrame): input DataFrame
        f (Callable): function to be performed
        use_numpy (bool, optional, defaults to True): pass each column as
            its `.values` array instead of as a Series

    Returns:
        Whatever `f` returns for the selected columns.
    """
    # Iterating the parameters mapping yields the parameter names in
    # definition order; the signature is inspected only once.
    arg_names = tuple(inspect.signature(f).parameters)
    if use_numpy:
        return f(*(df[name].values for name in arg_names))
    return f(*(df[name] for name in arg_names))
if __name__ == "__main__":
    # Executed as a script: run the doctests and the timing comparison.
    main()
The results for Python 3.11.2, Pandas 2.1.1 and NumPy 1.26.0 show that pda is, surprisingly, on par in performance with the best of the other methods (indexing and member access). As expected, .assign has terrible performance because it copies the entire DataFrame.
Timings (lower is better):
test_index 0.16944104398862692
test_assign 2.891109986925585
test_pda 0.1570397199393483
test_eval 0.8307543109549442
test_pda2 0.18781333995138993
test_lambda 0.1599503229081165
test_dot 0.16240537503472297
test_pda_series 0.2198283309226099
df['i'] = pda(df, lambda _, __, c, ___, ____, f, _____, ______: c + f) instead of df['i'] = df.c + df.f. Is that right?
df['i'] = pda(df, lambda c, f: c + f)