Don't eval() unless the user is entirely trusted. Instead, implement a simple language for such conditionals.
For example, we can define a data model that represents the structure of a logical expression and can be evaluated on a dictionary:
class Operator:
def evaluate(self, context: dict) -> bool:
raise NotImplementedError()
@dataclass
class OpEq(Operator):
variable: str
value: object
def evaluate(self, context: dict) -> bool:
return context.get(self.variable) == self.value
@dataclass
class OpAnd(Operator):
clauses: list[Operator]
def evaluate(self, context: dict) -> bool:
return all(c.evaluate(context) for c in self.clauses)
@dataclass
class OpOr(Operator):
clauses: list[Operator]
def evaluate(self, context: dict) -> bool:
return any(c.evaluate(context) for c in self.clauses)
For example, the expression (type = "a" OR type = "b") AND val3="off" would correspond to the objects:
predicate = OpAnd([
OpOr([
OpEq("type", "a"),
OpEq("type", "b"),
]),
OpEq("val3", "off"),
])
Next, we can write a parser that handles such expressions. There are many ways to write parsers. The following uses a “recursive descent” approach, i.e. just spells out a parser by hand. Note that the parsing functions have the structure (offset) -> (value, offset).
class InvalidSyntax(Exception):
def __init__(self, message: str, input: str, offset: int) -> None:
super().__init__(message, input, offset)
self.message = message
self.input = input
self.offset = offset
def __str__(self):
# format a nice error message that points to the error location
return f"{self.message}\n|{self.input}\n|{'^':>{self.offset+1}}"
RE_VAR = re.compile(r"\b(\w+)\b")
RE_KW_OR = re.compile(r"\bOR\b")
RE_KW_AND = re.compile(r"\bAND\b")
RE_INT = re.compile(r"\b([0-9]+)\b")
RE_STR = re.compile(r'"([^"]*)"') # could be extended to cover escaping
RE_SPACE = re.compile(r"\s+")
class Parser:
def __init__(self, input: str) -> None:
self.input = input
@classmethod
def parse(cls, input: str) -> Operator:
parser = cls(input)
op, i = parser.parse_logical(0)
if i != len(input):
raise parser.error(i, "expected end of expression")
return op
def parse_logical(self, i: int) -> Tuple[Operator, int]:
"""Parse `a OR b OR …` or `a AND b AND …` chains"""
op, i = self.parse_comparison(i)
i = self.skip_space(i)
# look ahead to see if this is an AND-chain or OR-chain
op_type: Callable[[list[Operator]], Operator]
if RE_KW_AND.match(self.input, i):
separator = RE_KW_AND
op_type = OpAnd
elif RE_KW_OR.match(self.input, i):
separator = RE_KW_OR
op_type = OpOr
else:
return op, i # no logical operator
# parse the entire chain
clauses = [op]
while m := separator.match(self.input, i):
i = self.skip_space(m.end())
op, i = self.parse_comparison(i)
clauses.append(op)
try:
i = self.skip_space(i)
except InvalidSyntax:
break
return op_type(clauses), i
def parse_comparison(self, i: int) -> Tuple[Operator, int]:
"""parse `var = value` comparisons and parenthesized expressions"""
if self.input.startswith("(", i):
return self.parse_parens(i)
if m := RE_VAR.match(self.input, i):
var = m.group(1)
i = m.end()
else:
raise self.error(i, "expected variable name")
i = self.skip_space(i)
if self.input.startswith("=", i):
i += 1
op_type = OpEq
else:
raise self.error(i, "expected: =")
i = self.skip_space(i)
value: object
if m := RE_INT.match(self.input, i):
value = int(m.group(1))
i = m.end()
elif m := RE_STR.match(self.input, i):
value = m.group(1)
i = m.end()
else:
raise self.error(i, "expected int or quoted string")
return op_type(var, value), i
def parse_parens(self, i: int) -> Tuple[Operator, int]:
"""parse a parenthesized expression (...)"""
if not self.input.startswith("(", i):
raise self.error(i, "expected: (")
i = self.skip_space(i)
op, i = self.parse_logical(i + 1)
i = self.skip_space(i)
if not self.input.startswith(")", i):
raise self.error(i, "expected: )")
return op, i + 1
def skip_space(self, i: int) -> int:
if m := RE_SPACE.match(self.input, i):
return m.end()
return i
def error(self, i, message) -> InvalidSyntax:
return InvalidSyntax(message, self.input, i)
This parser will produce somewhat nice error messages, for example:
>>> Parser.parse('(type = "a" OR type = "b" AND val3="off"')
Traceback (most recent call last):
...
test.InvalidSyntax: expected: )
|(type = "a" OR type = "b" AND val3="off"
| ^
This is of course a lot of boilerplate. Instead of writing a parser by hand, it would be possible to e.g. use a parser combinator library like parsec:
import parsec as p
def parse(input: str) -> Operator:
return logical.parse_strict(input)
string = p.regex(RE_STR).parsecmap(lambda s: s[1:-1])
integer = p.regex(RE_INT).parsecmap(int)
@p.generate
def compare_eq():
variable = yield p.regex(RE_VAR)
yield p.spaces()
yield p.string("=")
yield p.spaces()
value = yield string | integer
return OpEq(variable, value)
@p.generate
def parenthesized():
yield p.string("(")
yield p.spaces()
expr = yield logical
yield p.spaces()
yield p.string(")")
return expr
comparison = parenthesized | compare_eq
@p.generate
def logical():
op = yield comparison << p.spaces()
chain_type = yield p.optional(
p.lookahead(
p.regex(RE_KW_AND).result((RE_KW_AND, OpAnd))
| p.regex(RE_KW_OR).result((RE_KW_OR, OpOr))))
if not chain_type:
return op
separator, op_type = chain_type
clauses = yield p.many1(
p.spaces() >> p.regex(separator) >> p.spaces() >> comparison)
return op_type([op, *clauses])
This is a bit more declarative, but TBH Python is not a good language for techniques like combinator parsing that originate from functional programming. I tend to prefer writing parsers by hand since it's easier to reason about performance characteristics and to produce good error messages.
If you already have a syntax that fits your needs, you might be able to reuse a parser. For example, Python ships with the ast module that can parse Python expressions. Once that is done, we only have to convert the Python AST to our data model. For example:
def convert_logical(node: ast.AST) -> Operator:
if isinstance(node, ast.Expression):
return convert_logical(node.body)
if isinstance(node, ast.BoolOp):
if isinstance(node.op, ast.And):
return OpAnd([convert_logical(n) for n in node.values])
if isinstance(node.op, ast.Or):
return OpOr([convert_logical(n) for n in node.values])
elif isinstance(node, ast.Compare):
if len(node.ops) == 1 and isinstance(node.ops[0], ast.Eq):
name = node.left
[value] = node.comparators
assert isinstance(name, ast.Name)
assert isinstance(value, ast.Constant)
return OpEq(name.id, value.value)
raise InvalidSyntax("unsupported syntax", input, node.col_offset)
predicate = convert_logical(ast.parse(expression, mode="eval"))
This is very powerful, but limits you to an exact subset of the Python syntax – no syntax customizations are possible, you can only change how the syntax is interpreted. If the user input is untrusted, I would also be worried that this approach exposes a lot of attack surface (but Python's parser is probably much better tested than a hand-written one).
Incidentally, this approach is used by the Pandas library to support queries on dataframes. Uses like df.query("foo < 3") are parsed as Python syntax, but usually not evaluated as Python code. Instead, the operations are directly translated to queries on the dataframe. Here, the query would select all rows where the foo column is less-than 3.
Source code for this answer is available at https://gist.github.com/latk/3fa19e3c5ac0564b11539def09aed1e8