When talking performance, testing is key. Here are the runtimes (on my machine, of course) of some of the solutions presented in the other answers, the original code, and a numpy-based approach (all run on the same data):
import numpy as np
from collections import Counter
import time
random_data = np.random.randint(0, 100, [100, 100000])
#### Original code
def mostActive(customers):
    unique_customers = set(customers)
    count = len(customers)
    result = []
    for customer in unique_customers:
        if customers.tolist().count(customer) / count >= .05: # had to add .tolist() to make it compatible with the data
            result.append(customer)
    return sorted(result)
start = time.time()
for i in range(100):
    _ = mostActive(random_data[i])
end = time.time()
print(f'Avg time: {(end-start)*10} ms') # would be /100*1000 -> simplified to *10
# Avg time: 1394.4847583770752 ms
#### Sorted + Counter
def mostActive(customers):
    return sorted(customer
                  for customer, count in Counter(customers).items()
                  if count / len(customers) >= .05)
start = time.time()
for i in range(100):
    _ = mostActive(random_data[i])
end = time.time()
print(f'Avg time: {(end-start)*10} ms')
# Avg time: 16.061179637908936 ms
#### Numpy
start = time.time()
for i in range(100):
    unique_elements, counts = np.unique(random_data[i], return_counts=True)
    active = sorted(unique_elements[counts >= 0.05*len(random_data[i])]) # >= to match the original's "at least 5%" threshold
end = time.time()
print(f'Avg time: {(end-start)*10} ms')
# Avg time: 3.5660386085510254 ms
Unsurprisingly, the numpy-only solution is lightning-fast, thanks to the underlying high-performance C implementation.
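For convenience, the numpy version benchmarked above can also be wrapped into a drop-in replacement for mostActive. This is just a minimal sketch of the same logic; the function name mostActiveNp and the assumption that customers is a 1-D numpy array of ids are mine:
def mostActiveNp(customers):
    # customers: assumed to be a 1-D numpy array of customer ids
    unique_elements, counts = np.unique(customers, return_counts=True)
    # keep the ids that account for at least 5% of all records
    return sorted(unique_elements[counts >= 0.05 * len(customers)])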