I have written the code below to parse this XML file. You can see it's still a bit messy, but that I'm on the right track for most of it.
You can see one part that I'm stuck on is the 'targets' section (I've left the code that I've tried for this section in here with triple quotes, but you can see that section doesn't work).
I'm wondering if someone could help show me where I'm going wrong/how to parse the targets section? If you look at the HTML of the XML file here, I basically just want to extract the information in the targets section, for each gene/entry (or if it was possible, there seems to be more info in the targets section of the XML file, so if I could take that either)?
Thanks
import requests
import xml.etree.ElementTree as ET
import urllib2
#get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
# input.write(response.content)
tree = ET.parse('output.txt')
root = tree.getroot()
val = lambda x: "{http://www.drugbank.ca}" + str(x)
key_list = ['drugbank-id','name','description','cas-number','unii','average-mass','monoisotopic-mass','state','indication','pharmacodynamics','mechanism-of-action','toxicity','metabolism','absorption','half-life','protein-binding','route-of-elimination','volume-of-distribution','fda-label','msds']
key_dict = {}
for i in key_list:
for child in root.getchildren():
key_dict[i] = child.find(val(i)).text.encode('utf-8')
#print key_dict
def method1(str_name,list_name):
if subnode.tag == str_name:
list_name = []
for i in subnode:
list_name.append(i.text)
return list_name
def method2(list1_name,list2_name,list3_name,list4_name):
if subnode.tag == list1_name:
for i in subnode:
if i.tag == list2_name:
for a in i:
if a.tag == list3_name:
for u in a:
if u.tag == list4_name:
yield u.text
def method3(list1_name, list2_name):
list_of_tuples = []
if subnode.tag == list1_name:
for i in subnode:
if i.tag == list2_name:
temp_list = []
for a in i:
temp_list.append(a.text)
list_of_tuples.append(temp_list)
return list_of_tuples
alternative_parents = []
substituents = []
list_to_run_thru = ['description','direct-parent','kingdom','superclass','class','subclass']
ap_sub = lambda x:'{http://www.drugbank.ca}'+ x
for node in root:
for subnode in node:
print method1('{http://www.drugbank.ca}groups','group_list')
print method1('{http://www.drugbank.ca}synonyms','synonym_list')
print method1('{http://www.drugbank.ca}patent','patent_list')
print method2('{http://www.drugbank.ca}general-references','{http://www.drugbank.ca}articles','{http://www.drugbank.ca}article','{http://www.drugbank.ca}pubmed-id')#
if subnode.tag == '{http://www.drugbank.ca}classification':
for each_item in list_to_run_thru:
for i in subnode:
if i.tag == ap_sub(each_item):
print i.text
if i.tag == '{http://www.drugbank.ca}alternative-parent':
alternative_parents.append(i.text)
if i.tag == '{http://www.drugbank.ca}substituent':
substituents.append(i.text)
print method3('{http://www.drugbank.ca}salts','{http://www.drugbank.ca}salt')
print method3('{http://www.drugbank.ca}products','{http://www.drugbank.ca}product')
print method3('{http://www.drugbank.ca}mixtures','{http://www.drugbank.ca}mixture')
print method3('{http://www.drugbank.ca}packagers','{http://www.drugbank.ca}packager')
print method3('{http://www.drugbank.ca}categories','{http://www.drugbank.ca}category')
print method3('{http://www.drugbank.ca}dosages','{http://www.drugbank.ca}dosage')
print method3('{http://www.drugbank.ca}atc-codes','{http://www.drugbank.ca}atc-code')
print method3('{http://www.drugbank.ca}ahfs-codes','{http://www.drugbank.ca}ahfs-code')
print method3('{http://www.drugbank.ca}pdb-entries','{http://www.drugbank.ca}pdb-entry')
print method3('{http://www.drugbank.ca}food-interactions','{http://www.drugbank.ca}food-interaction')
print method3('{http://www.drugbank.ca}drug-interactions','{http://www.drugbank.ca}drug-interaction')
print method3('{http://www.drugbank.ca}calculated-properties','{http://www.drugbank.ca}property')
print method3('{http://www.drugbank.ca}external-identifiers','{http://www.drugbank.ca}external-identifier')
print method3('{http://www.drugbank.ca}external-links','{http://www.drugbank.ca}external-link')
print method3('{http://www.drugbank.ca}snp-adverse-drug-reactions','{http://www.drugbank.ca}reaction')
print substituents
print alternative_parents
'''
if subnode.tag == '{http://www.drugbank.ca}pathways':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}pathway':
for a in i:
print a.text
for u in a:
if u.tag == '{http://www.drugbank.ca}drug':
for x in u:
print x.text
#missing a bit of data here
if subnode.tag == '{http://www.drugbank.ca}targets':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}target':
print i.text
for a in i:
print a.text
if a.tag == '{http://www.drugbank.ca}actions':
for u in a:
print u.text
if a.tag == '{http://www.drugbank.ca}references':
for u in a:
if u.tag == '{http://www.drugbank.ca}articles':
for x in u:
if x.tag == '{http://www.drugbank.ca}article':
for z in x:
print z.text
'''