EDIT: For anyone coming to this in the future: the solution I used was to switch to cElementTree. It not only runs with less memory, it is also significantly faster.
This works on files up to about 600 MB in size; anything larger and I run out of memory (I have a 16 GB machine). What can I do to read the file in pieces, or process a portion of the XML at a time — or is there a less memory-intensive approach?
import csv
import xml.etree.ElementTree as ET
from lxml import etree
import time
import sys
def main(argv):
start_time = time.time()
#file_name = 'sample.xml'
file_name = argv
root = ET.ElementTree(file=file_name).getroot()
csv_file_name = '.'.join(file_name.split('.')[:-1]) + ".txt"
print '\n'
print 'Output file:'
print csv_file_name
with open(csv_file_name, 'w') as file_:
writer = csv.writer(file_, delimiter="\t")
header = [ <the names of the tags here> ]
writer.writerow(header)
tags = [
<bunch of xml tags here>
]
#write the values
# for index in range(8,1000):
for index in range(3,len(root)):
#print index
row=[]
for tagindex,val in enumerate(tags):
searchQuery = "tags"+tags[tagindex]
# print searchQuery
# print root[index]
# print root[index].find(searchQuery).text
if (root[index].find(searchQuery) is None) or (root[index].find(searchQuery).text == None):
row.extend([""])
#print tags[tagindex]+" blank"
else:
row.extend([root[index].find(searchQuery).text])
#print tags[tagindex]+" "+root[index].find(searchQuery).text
writer.writerow(row)
#for i,child in enumerate(root):
#print root[i]
print '\nNumber of elements is: %s' % len(root)
print '\nTotal run time: %s seconds' % (time.time() - start_time)
if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when the
    # input path is missing.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <input.xml>\n" % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])
# NOTE(review): dead at runtime -- this executes only after main() has
# already returned, so the C-accelerated parser is never actually used.
# This is the cElementTree switch mentioned in the EDIT; move it up to
# replace the plain ElementTree import at the top of the file.
import xml.etree.cElementTree as ET