Does anyone know why test1b() is so much faster than test1a()? How do you identify which line is the bottleneck and choose a faster alternative to speed it up? Please share your experience.
import numpy as np
import pandas as pd
import time
def test1a(rows=10000000):
    """Build a random benchmark DataFrame, filling labels via ``Series.apply``.

    This is the slow variant of the comparison: the ``v11`` and ``v12``
    label columns are produced by a Python lambda that ``apply`` invokes
    for every element of ``v03``, and each invocation performs its own
    scalar ``np.random.randint`` call.

    Args:
        rows: Number of rows to generate. Defaults to 10000000, the size
            that was previously hard-coded.

    Returns:
        A DataFrame with ``rows`` rows and the 13 columns ``v01``..``v12``
        and ``outcome``. All columns hold random 0/1 integers except
        ``v11`` (one of 't1'..'t4') and ``v12`` ('p1' or 'p2').
    """
    col_names = ['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07',
                 'v08', 'v09', 'v10', 'v11', 'v12', 'outcome']
    # Derive the column count from the names so the two cannot disagree.
    cols = len(col_names)
    raw_data = np.random.randint(2, size=cols * rows).reshape(rows, cols)
    df = pd.DataFrame(raw_data, columns=col_names)
    # Bottleneck: the lambda ignores its argument and runs once per element,
    # so the random draw and list lookup happen in Python for every row.
    df['v11'] = df['v03'].apply(
        lambda x: ['t1', 't2', 't3', 't4'][np.random.randint(4)])
    df['v12'] = df['v03'].apply(
        lambda x: ['p1', 'p2'][np.random.randint(2)])
    return df
def test1b(rows=10000000):
    """Build a random benchmark DataFrame, filling labels with vectorized NumPy.

    This is the fast variant of the comparison: for each label column all
    ``rows`` random indices are drawn in a single ``np.random.randint``
    call and mapped to labels with a single ``np.take`` lookup, so no
    Python function is called per row.

    Args:
        rows: Number of rows to generate. Defaults to 10000000, the size
            that was previously hard-coded.

    Returns:
        A DataFrame with ``rows`` rows and the 13 columns ``v01``..``v12``
        and ``outcome``. All columns hold random 0/1 integers except
        ``v11`` (one of 't1'..'t4') and ``v12`` ('p1' or 'p2').
    """
    col_names = ['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07',
                 'v08', 'v09', 'v10', 'v11', 'v12', 'outcome']
    # Derive the column count from the names so the two cannot disagree.
    cols = len(col_names)
    raw_data = np.random.randint(2, size=(rows, cols))
    df = pd.DataFrame(raw_data, columns=col_names)
    # Draw every index at once, then look the labels up in one array operation.
    df['v11'] = np.take(
        np.array(['t1', 't2', 't3', 't4'], dtype=object),
        np.random.randint(4, size=rows))
    df['v12'] = np.take(
        np.array(['p1', 'p2'], dtype=object),
        np.random.randint(2, size=rows))
    return df
# Time one run of each variant. perf_counter() is used instead of time.time()
# because it is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
test1a()
t1a = time.perf_counter() - start_time

start_time = time.perf_counter()
test1b()
t1b = time.perf_counter() - start_time

print("Test1a: {}sec, Test1b: {}sec".format(t1a, t1b))