I have a CSV with 400,000 rows. For each row I take all the previous rows, filter them using multiple conditions derived from the current row, and compute some statistics. In total it takes too much time, so I am trying to find a way to speed it up. I am choosing between the options below:
- a sqlite3 database
- using pandas
- using itertools.ifilter with the csv file
The following is the actual code I have, using the for loop as mentioned here:
# -*- coding: utf-8 -*-
import csv
import numpy as np
import time
import itertools
# Functions
def get_shmeio_stats(data, reference_list):
    """Tally how often each value in reference_list occurs in data.

    Parameters:
        data: list of outcome strings (e.g. '1', 'x', '2') to be counted.
        reference_list: the outcome values to report, in output order.

    Returns:
        A '-'-joined string of counts, one per reference value
        (e.g. '2-1-0'), or the placeholder '-' when data is empty or
        none of the reference values occur at all.
    """
    if not data:
        return '-'
    # One pass over data instead of one full .count() scan per reference
    # value; matters because this is called many times per row.
    from collections import Counter
    counts = Counter(data)
    tally = [counts[item] for item in reference_list]
    # Original code compared against the literal '0-0-0', which only
    # works for exactly 3 reference values; this generalizes the check.
    if not any(tally):
        return '-'
    return '-'.join(map(str, tally))
bet365_data_stats = []  # accumulates one output row (original columns + stats) per match
shmeia_list = ['1', 'x', '2']  # the possible match outcomes to tally
with open('BET365.csv', 'rb') as f:  # 'rb' is the Python 2 csv-module convention
    bet365_data = csv.reader(f)
    # Skip the header row with next() instead of list(...)[1:], which
    # materialized the whole 400k-row list and then copied all but the
    # first element.
    next(bet365_data)
    bet365_matches = list(bet365_data)
start = time.time()
# Main O(n^2) pass: for every match, rescan ALL earlier matches and bucket
# their outcome (column 11, `simeio`) into the per-statistic lists.
# NOTE(review): indentation was reconstructed from the inline comments and
# the mirrored home/away structure -- confirm nesting against the original.
for index, each_match in enumerate(bet365_matches):
    print index,  # progress indicator (Python 2 print statement)
    start_each = time.time()
    id = index  # NOTE(review): shadows the `id` builtin; used only as the row id below
    # print index
    # Row layout: league, year, date, home team, 1/x/2 odds, away team,
    # score, per-side goals, outcome (simeio), favourite, under/over.
    protathlima, xronia, match_date, home, odd_1, odd_x, odd_2, away, score, score_1, score_2, simeio, favori, under_over = each_match
    # Every row strictly before the current one -- this rescan is what
    # makes the whole script quadratic in the number of rows.
    previous_matches = bet365_matches[:index]
    home_1, home_x, home_2 = [], [], []  # home_1, home_x, home_2
    away_1, away_x, away_2 = [], [], []  # away_1, away_x, away_2
    home_all_yrs_protathlima, away_all_yrs_protathlima = [], []  # home_all_yrs_protathlima, away_all_yrs_protathlima
    home_forma_last_6_home, away_forma_last_6_away = [], []  # home_forma_last_6_home, away_forma_last_6_away
    home_forma_last_6_home_away, away_forma_last_6_home_away = [], []  # home_forma_last_6_home_away, away_forma_last_6_home_away
    akrivis_protathlima, akrivis_genika = [], []  # akrivis_protathlima, akrivis_genika
    mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away = [], []  # mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away
    for each_item in previous_matches:
        # Earlier match where the current home team also played at home.
        if each_item[3] == home:
            # home_1, home_x, home_2: earlier home games with the same individual odds
            if each_item[4] == odd_1:
                home_1.append(each_item[11])
            if each_item[5] == odd_x:
                home_x.append(each_item[11])
            if each_item[6] == odd_2:
                home_2.append(each_item[11])
            # home_all_yrs_protathlima: same league, any year
            if each_item[0] == protathlima:
                home_all_yrs_protathlima.append(each_item[11])
                # home_forma_last_6_home: same league AND same year
                if each_item[1] == xronia:
                    home_forma_last_6_home.append(each_item[11])
                    home_forma_last_6_home_away.append(each_item[11])
                    mesos_oros_goal_home_last_6_home.append(float(each_item[9]))
        # Earlier match where the current home team played away.
        if each_item[7] == home:
            if each_item[0] == protathlima:
                # home_forma_last_6_home_away
                if each_item[1] == xronia:
                    home_forma_last_6_home_away.append(each_item[11])
        # Earlier match where the current away team played at home.
        if each_item[3] == away:
            if each_item[0] == protathlima:
                # away_forma_last_6_away
                # NOTE(review): despite the comment above, this feeds
                # away_forma_last_6_home_away -- kept as in the original.
                if each_item[1] == xronia:
                    away_forma_last_6_home_away.append(each_item[11])
        # Earlier match where the current away team also played away
        # (mirror image of the home branch above).
        if each_item[7] == away:
            # away_1, away_x, away_2
            if each_item[4] == odd_1:
                away_1.append(each_item[11])
            if each_item[5] == odd_x:
                away_x.append(each_item[11])
            if each_item[6] == odd_2:
                away_2.append(each_item[11])
            # away_all_yrs_protathlima
            if each_item[0] == protathlima:
                away_all_yrs_protathlima.append(each_item[11])
                # away_forma_last_6_away
                if each_item[1] == xronia:
                    away_forma_last_6_away.append(each_item[11])
                    away_forma_last_6_home_away.append(each_item[11])
                    mesos_oros_goal_away_last_6_away.append(float(each_item[10]))
        # akrivis_protathlima, akrivis_genika: earlier matches with the
        # exact same 1/x/2 odds triple, overall and within the league.
        if each_item[4] == odd_1 and each_item[5] == odd_x and each_item[6] == odd_2:
            akrivis_genika.append(each_item[11])
            if each_item[0] == protathlima:
                akrivis_protathlima.append(each_item[11])
    stop_filter = time.time() - start_each  # time spent filtering this row
    print round(stop_filter, 6),
    # Calculate statistics: each list is collapsed to a '1-x-2' tally string.
    # home_1, home_x, home_2
    home_1 = get_shmeio_stats(home_1, shmeia_list)
    home_x = get_shmeio_stats(home_x, shmeia_list)
    home_2 = get_shmeio_stats(home_2, shmeia_list)
    # away_1, away_x, away_2
    away_1 = get_shmeio_stats(away_1, shmeia_list)
    away_x = get_shmeio_stats(away_x, shmeia_list)
    away_2 = get_shmeio_stats(away_2, shmeia_list)
    # home_all_yrs_protathlima, away_all_yrs_protathlima
    home_all_yrs_protathlima = get_shmeio_stats(home_all_yrs_protathlima, shmeia_list)
    away_all_yrs_protathlima = get_shmeio_stats(away_all_yrs_protathlima, shmeia_list)
    # home_forma_last_6_home, away_forma_last_6_away -- only the 6 most recent
    home_forma_last_6_home = get_shmeio_stats(home_forma_last_6_home[-6:], shmeia_list)
    away_forma_last_6_away = get_shmeio_stats(away_forma_last_6_away[-6:], shmeia_list)
    # home_forma_last_6_home_away, away_forma_last_6_home_away
    home_forma_last_6_home_away = get_shmeio_stats(home_forma_last_6_home_away[-6:], shmeia_list)
    away_forma_last_6_home_away = get_shmeio_stats(away_forma_last_6_home_away[-6:], shmeia_list)
    # akrivis_protathlima, akrivis_genika
    akrivis_protathlima = get_shmeio_stats(akrivis_protathlima, shmeia_list)
    akrivis_genika = get_shmeio_stats(akrivis_genika, shmeia_list)
    # mesos_oros_goal_home_last_6_home, mesos_oros_goal_away_last_6_away:
    # average goals over the last 6 relevant matches, '-' when unavailable.
    try:
        if mesos_oros_goal_home_last_6_home:
            mesos_oros_goal_home_last_6_home = round(np.average(mesos_oros_goal_home_last_6_home[-6:]), 2)
        else:
            mesos_oros_goal_home_last_6_home = '-'
    except:  # NOTE(review): bare except silently hides real errors
        mesos_oros_goal_home_last_6_home = '-'
    try:
        if mesos_oros_goal_away_last_6_away:
            mesos_oros_goal_away_last_6_away = round(np.average(mesos_oros_goal_away_last_6_away[-6:]), 2)
        else:
            mesos_oros_goal_away_last_6_away = '-'
    except:  # NOTE(review): bare except silently hides real errors
        mesos_oros_goal_away_last_6_away = '-'
    stop_function = time.time() - start_each
    print round(stop_function, 6),
    # Output row: the original 15 columns followed by all derived statistics.
    match_stats = [id, protathlima, xronia, match_date, home, odd_1, odd_x,
                   odd_2, away, score, score_1, score_2, simeio,
                   favori, under_over, home_1, home_x, home_2, away_1,
                   away_x, away_2, home_all_yrs_protathlima,
                   away_all_yrs_protathlima, home_forma_last_6_home,
                   away_forma_last_6_away, home_forma_last_6_home_away,
                   away_forma_last_6_home_away, akrivis_protathlima,
                   akrivis_genika, mesos_oros_goal_home_last_6_home,
                   mesos_oros_goal_away_last_6_away]
    bet365_data_stats.append(match_stats)
    stop_each = time.time() - start_each  # total wall time for this row
    print round(stop_each, 6)
stop = time.time() - start
print 'Completed in:', stop
with open('BET365_stats_loop.csv', 'wb') as f:
bet365_stats = csv.writer(f)
bet365_stats.writerows(bet365_data_stats)
This is a part of my csv:

I have run it for up to 180,000 rows, and each row took at most 0.3 s.
Do you think that, because of the amount of data, the whole process will take too long no matter which approach I choose?
Update: In the end I used a dictionary, adding new keys when needed and updating them accordingly. It got really fast that way.