I have 30 csv files. Each file has 200,000 row and 10 columns.
I want to read these files and do some process. Below is the code without multi-thread:
import os
import time
csv_dir = './csv'
csv_save_dir = './save_csv'
csv_files = os.listdir(csv_dir)
if __name__ == '__main__':
if not os.path.exists(csv_save_dir):
os.makedirs(csv_save_dir)
start = time.perf_counter()
for csv_file in csv_files:
csv_file_path = os.path.join(csv_dir,csv_file)
with open(csv_file_path,'r') as f:
lines = f.readlines()
csv_file_save_path = os.path.join(csv_save_dir,'1_'+csv_file)
with open(csv_file_save_path,'w') as f:
f.writelines(lines[:20])
print(f'CSV File saved...')
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
The elapsed time of the above code is about 7 seconds. This time, I modified the above code with multi-thread. The code is as follows:
import os
import time
import concurrent.futures
csv_dir = './csv'
csv_save_dir = './save_csv'
csv_files = os.listdir(csv_dir)
def read_and_write_csv(csv_file):
csv_file_path = os.path.join(csv_dir,csv_file)
with open(csv_file_path,'r') as f:
lines = f.readlines()
csv_file_save_path = os.path.join(csv_save_dir,'1_'+csv_file)
with open(csv_file_save_path,'w') as f:
f.writelines(lines[:20])
print(f'CSV File saved...')
if __name__ == '__main__':
if not os.path.exists(csv_save_dir):
os.makedirs(csv_save_dir)
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
executor.map(read_and_write_csv, [csv_file for csv_file in csv_files])
finish = time.perf_counter()
print(f'Finished in {round(finish-start, 2)} second(s)')
I expected the above code take less time than my first code because of using multi-threads. But the elapsed time is about 7 seconds!!
Is there way to speed up using multi-threads?
ProcessPoolExecutor? Maybe it's that simple.