I have a Python script that downloads, unzips, and parses an XML file published by a Canadian institution. Only some very specific tags are extracted and then all put into a pandas DataFrame for later processing.
Everything works well. I just wonder if there is room for improvement here, especially in the parsing part. I am not sure whether the nested for loops I use are a good idea, or whether there is a better and cleaner way to parse.
import requests
import zipfile
import os
import glob
from lxml import etree
from io import StringIO, BytesIO
import pandas as pd
import xml.etree.ElementTree as ET
def download_file(url, filename):
    """Download *url* and save the response body to *filename*.

    Args:
        url: HTTP(S) URL to fetch (redirects are followed).
        filename: Local path the raw response bytes are written to.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status,
            instead of silently saving the error page to disk.
    """
    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()  # fail loudly rather than writing an error page
    # Context manager guarantees the file handle is closed (the original
    # open(...).write(...) leaked the handle until GC).
    with open(filename, 'wb') as f:
        f.write(r.content)
def unzip_and_delete(filename):
    """Extract every member of the zip archive *filename* into the current
    working directory, then delete the archive.

    Args:
        filename: Path to a zip archive.

    Raises:
        zipfile.BadZipFile: if *filename* is not a valid zip archive
            (the archive is NOT deleted in that case).
    """
    # Context manager closes the archive even if extractall() raises;
    # the original left the handle open on an extraction error.
    with zipfile.ZipFile(filename, 'r') as zf:
        zf.extractall()
    os.remove(filename)
def parse_xml_fields(file, base_tag, tag_list, final_list):
    """Extract selected tags from every *base_tag* element of an XML file.

    For each ``<base_tag>`` element found anywhere in the document, build a
    dict mapping each tag in *tag_list* to the stripped text of its first
    matching descendant, and append that dict to *final_list* (mutated in
    place). Tags that are absent — or present but empty — are omitted from
    the dict, so the resulting DataFrame gets NaN there.

    Args:
        file: Path or file-like object parseable by ``etree.parse``.
        base_tag: Tag name of the record elements to iterate over.
        tag_list: Tag names to extract from each record.
        final_list: List that accumulates one dict per record.
    """
    tree = etree.parse(file)
    # ".//" (relative descendant search) instead of the deprecated "//"
    # absolute form; iterfind avoids materialising the full node list.
    for node in tree.iterfind(".//{}".format(base_tag)):
        item = {}
        for tag in tag_list:
            child = node.find(".//{}".format(tag))  # single lookup, not two
            # Guard text is None: an empty element (<Tag/>) would otherwise
            # raise AttributeError on .strip().
            if child is not None and child.text is not None:
                item[tag] = child.text.strip()
        final_list.append(item)
# --- Driver script ---------------------------------------------------------
# Tags to pull out of each MsbInformation record; parse_xml_fields fills
# entities_list in place.
field_list = [
    "MsbRegistrationNumber",
    "StatusDescriptionEnglish",
    "Surname",
    "GivenName",
    "MiddleName",
    "Name",
    "StreetAddress",
]
entities_list = []

# Fetch the FINTRAC MSB registry dump, unpack it, and pull out the fields.
download_file('http://www10.fintrac-canafe.gc.ca/msb-esm/public/msb-search/zipdownload-eng/', 'fintrac.zip')
unzip_and_delete('fintrac.zip')
parse_xml_fields("MsbRegistryPublicDataFile.xml", "MsbInformation", field_list, entities_list)

# Fixed column order so the spreadsheet layout is stable across runs.
df = pd.DataFrame(entities_list, columns=field_list)
df.to_excel("Canada_MSB_List.xlsx")