I created a function that works a bit like the `subset` function in R,
similar to what is asked for here.
I haven't found a way to combine an `%in%` list condition with `and`/`or` operators in a single query, so I apply those subsets one at a time:
def subset(df, query=None, select=None, unselect=None, asindex=False, returnFullDFIfError=False, **kwargs):
    """
    Subset a pandas DataFrame by rows (query) and by columns (select / unselect).

    Rows are filtered with ``DataFrame.query``; afterwards columns are either
    kept (``select``) or dropped (``unselect``).

    Parameters:
        df (pd.DataFrame): The DataFrame to be subsetted.
        query (str, optional): A query string to filter rows. It may contain
            ``column %in% name`` / ``column %!in% name`` membership conditions,
            where ``name`` is a keyword argument holding the values to test
            against. Default is None (no row filtering).
        select (list, optional): Columns to keep. Takes precedence over
            ``unselect`` when both are given. Default is None.
        unselect (list, optional): Columns to drop. Default is None.
        asindex (bool, optional): If True, return only the index of the
            matching rows; ``select`` and ``unselect`` are ignored.
            Default is False.
        returnFullDFIfError (bool, optional): If True, an error raised while
            querying or selecting columns is swallowed and the DataFrame is
            returned without the failed step (note: any ``%in%`` / ``%!in%``
            pre-filtering has already been applied to it). Default is False,
            i.e. the error propagates to the caller.
        **kwargs: Variables referenced by ``%in%`` / ``%!in%`` conditions.

    Returns:
        pd.DataFrame or pd.Index: The subsetted DataFrame, or its index when
        ``asindex`` is True.

    Examples:
        #>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
        #>>> subset(df, query='A > 1', select=['B', 'C'])
        #>>> subset(df, query='A < 2', unselect=['C'])

        #>>> names_list = ['Alice', 'David']
        #>>> subset(df, query="Name %in% names_list", select=['Name', 'Age'], names_list=names_list)
            Name  Age
        0  Alice   25
        3  David   40

    IMPORTANT:
        Membership conditions are applied as a pre-filter by
        ``_preprocess_query`` and removed from the query string before
        ``DataFrame.query`` runs. Combining them with other conditions in one
        query string (in particular with ``or`` / ``|``) is not reliably
        supported; apply such filters in separate ``subset`` calls instead.
    """
    # Normalise the column arguments so the truthiness checks below are safe.
    select = list(select) if select else []
    unselect = list(unselect) if unselect else []

    # Apply %in% / %!in% conditions up front; `query` may come back empty when
    # it consisted of nothing but such a condition.
    if query:
        df, query = _preprocess_query(df, query, kwargs)

    try:
        # An empty/None query means "keep every row". This also covers
        # asindex=True, which previously called df.query() with an empty
        # expression and failed.
        filtered_df = df.query(query) if query else df

        if asindex:
            return filtered_df.index
        if select:
            return filtered_df[select]
        if unselect:
            return filtered_df[[col for col in df.columns if col not in unselect]]
        return filtered_df
    except Exception:
        # Deliberate best-effort mode: hand back the frame instead of failing.
        if returnFullDFIfError:
            return df
        raise
def _preprocess_query(df, query, variables):
    """
    Apply %in% / %!in% conditions to the DataFrame and normalise logical operators.

    ``DataFrame.query`` does not understand ``column %in% name`` or
    ``column %!in% name``. Each such condition is therefore applied to ``df``
    immediately with ``Series.isin`` and then removed from the query string.
    The word operators `` or ``/`` OR ``/`` Or `` and `` and ``/`` AND ``/`` And ``
    are rewritten to ``|`` and ``&``.

    Text inside quoted string literals is never rewritten, so a value such as
    ``"salt and pepper"`` is passed through unchanged.

    If the remaining query is a flat conjunction (no ``|`` and no parentheses
    outside string literals), the ``&`` operators left dangling by a removed
    condition are dropped too: ``"Name %in% names and Age > 30"`` becomes
    ``"Age > 30"``, evaluated later on the pre-filtered frame. When ``|`` or
    parentheses are present, pre-filtering would not match the meaning of the
    expression, so the query is left as it is after removal; such combinations
    remain unsupported.

    Parameters:
        df (pd.DataFrame): The DataFrame to be processed.
        query (str): The query string.
        variables (dict): Values for the names used on the right-hand side of
            ``%in%`` / ``%!in%``. A name missing from this dict is treated as
            an empty list.

    Returns:
        tuple: The pre-filtered DataFrame and the updated query string
        (stripped; empty when nothing but membership conditions was given).
    """
    # Imported locally: the only `import re` visible in this file lives inside
    # `subset` and is therefore not in scope here.
    import re

    literal_re = re.compile(r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')""")
    in_re = re.compile(r'(\w+)\s*%(!?in)%\s*(\w+)')
    operator_aliases = (
        (" or ", " | "), (" OR ", " | "), (" Or ", " | "),
        (" and ", " & "), (" AND ", " & "), (" And ", " & "),
    )

    # Splitting on a capturing group puts the text outside string literals at
    # even positions and the literals themselves at odd positions.
    segments = literal_re.split(query)
    found_in_condition = False

    for idx in range(0, len(segments), 2):
        text = segments[idx]

        # Standardize logical operators
        for alias, symbol in operator_aliases:
            text = text.replace(alias, symbol)

        # Process %in% and %!in% conditions
        for col, operator, var in in_re.findall(text):
            found_in_condition = True
            values = variables.get(var, [])
            mask = df[col].isin(values)
            df = df[~mask] if operator == '!in' else df[mask]

        # Remove %in% and %!in% from the query
        segments[idx] = in_re.sub('', text)

    flat_conjunction = not re.search(r'[|()]', ''.join(segments[0::2]))
    if found_in_condition and flat_conjunction:
        # Every remaining term is AND-ed, so pre-filtering is exact and the
        # operators orphaned by the removed conditions can simply go.
        for idx in range(0, len(segments), 2):
            segments[idx] = re.sub(r'&(\s*&)+', '&', segments[idx])
        updated_query = ''.join(segments).strip()
        updated_query = re.sub(r'^(?:&\s*)+|(?:\s*&)+$', '', updated_query)
        return df, updated_query.strip()

    return df, ''.join(segments).strip()
`df.query` and `pd.eval` seem like good fits for this use case. For information on the `pd.eval()` family of functions, their features and use cases, please visit Dynamic Expression Evaluation in pandas using pd.eval().