Here is a somewhat more robust approach which will properly handle formulas with nested sub-expressions, such as Na(OH)2 or Al(NO3)3:
# Loosely based on example code from
# http://pyparsing.wikispaces.com/file/detail/chemicalFormulas.py
from pyparsing import Group, Forward, Literal, nums, oneOf, OneOrMore, Optional, Word
# from http://pyparsing-public.wikispaces.com/Helpful+Expressions
# element("He") => "He"
# Parser matching exactly one chemical element symbol (case-sensitive).
# oneOf() splits the string on whitespace, so the row layout is cosmetic, and
# it tests longer alternatives first where one symbol is a prefix of another,
# so "Hg" is read as mercury rather than as "H" followed by a stray "g".
# The systematic placeholder names (Uub ... Uuo) are kept for backward
# compatibility; the last row adds the approved symbols for elements 112-118.
element = oneOf(
    """H He Li Be B C N O F Ne Na Mg Al Si P S Cl
    Ar K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge
    As Se Br Kr Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag
    Cd In Sn Sb Te I Xe Cs Ba Lu Hf Ta W Re Os
    Ir Pt Au Hg Tl Pb Bi Po At Rn Fr Ra Lr Rf
    Db Sg Bh Hs Mt Ds Rg Uub Uut Uuq Uup Uuh Uus
    Uuo La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm
    Yb Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No
    Cn Nh Fl Mc Lv Ts Og"""
)
# integer("123") => 123
def to_int(tokens):
    """Parse action: return the first matched token converted to ``int``."""
    first = tokens[0]
    return int(first)
# A run of digits; the parse action replaces the matched text with an int.
integer = Word(nums)
integer.setParseAction(to_int)
# item("He") => {"He": 1}
# item("O2") => {"O": 2}
def item_to_dict(tokens):
    """Parse action: turn ``(symbol, count)`` pairs into a ``{symbol: count}`` dict."""
    counts = {}
    for symbol, count in tokens:
        counts[symbol] = count
    return counts
# One element symbol with an optional count (a missing count defaults to 1).
# Group() packs symbol and count into a single token for item_to_dict.
item = Group(element + Optional(integer, default=1))
item.setParseAction(item_to_dict)
# allow recursive definition of formula
# Forward-declared so a parenthesised group can contain a whole formula;
# the actual definition is attached further down with `<<=`.
Formula = Forward()
# expr("(OH)2") => {"O": 2, "H": 2}
# The parentheses only delimit the group; suppress() keeps them out of the
# token list handed to the parse action.
lpar, rpar = (Literal(paren).suppress() for paren in "()")
def expr_to_dict(tokens):
    """Parse action: scale every count in ``tokens[0]`` by the factor ``tokens[1]``.

    ``tokens[0]`` is a ``{symbol: count}`` dict and ``tokens[1]`` the multiplier;
    a new dict is returned and the input dict is left untouched.
    """
    scaled = {}
    for symbol, count in tokens[0].items():
        scaled[symbol] = count * tokens[1]
    return scaled
# A parenthesised sub-formula followed by an optional multiplier.
# The multiplier defaults to 1, mirroring `item`, so "(OH)" parses the same
# as "(OH)1" instead of being rejected for lacking a trailing number.
# Tokens reaching the parse action are [sub-formula dict, multiplier].
expr = (lpar + Formula + rpar + Optional(integer, default=1)).setParseAction(expr_to_dict)
# ... complete the recursive definition
def formula_to_dict(tokens):
    """Parse action: merge per-term element counts into one total.

    Args:
        tokens: iterable of ``{symbol: count}`` dicts, one per item or
            parenthesised group in the formula.

    Returns:
        A new dict mapping each symbol to the sum of its counts across all
        terms; the input dicts are not modified.
    """
    total = {}
    # Named `term` rather than `expr` so the module-level `expr` parser is
    # not shadowed inside this function.
    for term in tokens:
        for el, num in term.items():
            total[el] = total.get(el, 0) + num
    return total
# A formula is one or more plain items or parenthesised groups; each term
# yields a dict of counts, and formula_to_dict sums them into a single dict.
_formula_terms = OneOrMore(item | expr)
_formula_terms.setParseAction(formula_to_dict)
Formula <<= _formula_terms
# Finally, wrap it in an easy-to-use function:
def get_elements(s):
    """Return the element counts of the chemical formula ``s``.

    Args:
        s: formula text such as ``"Al(NO3)3"``.

    Returns:
        The ``{symbol: count}`` dict produced by the ``Formula`` grammar.

    Raises:
        pyparsing.ParseException: if ``s`` is not, in its entirety, a valid
            formula.
    """
    # parseAll=True makes the grammar consume the whole string. Without it
    # pyparsing stops at the first character it cannot match and silently
    # returns the counts for the leading valid prefix only.
    return Formula.parseString(s, parseAll=True)[0]
You can use it like:
>>> get_elements("Na(OH)2")
{'H': 2, 'Na': 1, 'O': 2}
>>> get_elements("Al(NO3)3")
{'Al': 1, 'N': 3, 'O': 9}
>>> get_elements("Ba4H2Ba5Li3")
{'Ba': 9, 'H': 2, 'Li': 3}
Should H in HgH2 be 2 or 3? {'Hg': 1, 'H': 2}. :-)