Here's my code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt
#create a list of each year where data will be extract
years_list = [2001, 2002, 2008, 2012, 2015,2018, 2020 , 2021]
player_list = ['Mac Jones', 'Aaron Rodgers', 'Deshaun Watson', 'Patrick Mahomes',
'Josh Allen', 'Ryan Tannehill', 'Drew Bress', 'Russel Wilson',
'Kirk Cousins', 'Tom Brady', 'Derek Carr']
#selecting stats
cols = ['Player', 'Tm','Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate', 'G']
df_list = []
#loop for extract data
for year in years_list:
url_mac = f'https://www.pro-football-reference.com/years/{year}/passing.htm'
temp_df = pd.read_html(url_mac)[0][cols]
temp_df['Season'] = year
df_list.append(temp_df)
print(f'Collected: {year}')
data_radar = pd.concat(df_list)
#renaming columns
new_columns = data_radar.columns.values
new_columns[-6] = 'y_sack'
data_radar.columns = new_columns
#picking stats
mid_data = pd.DataFrame()
for player in player_list:
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '*' + '+'])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player])
mid_data = mid_data.append(data_radar[data_radar['Player'] == player + '+'])
#relevant stats
cols = ['Cmp%', 'Yds', 'Int', 'Y/A','Rate', 'G', 'Season']
final_data = pd.DataFrame()
#fixing names
mid_data = mid_data.replace({'Tom Brady*':'Tom Brady', 'Aaron Rodgers*':'Aaron Rodgers','Aaron Rodgers*+':'Aaron Rodgers',
'Deshaun Watson*':'Deshaun Watson', 'Josh Allen*':'Josh Allen',
'Derek Carr*':'Derek Carr','Patrick Mahomes*':'Patrick Mahomes', 'Patrick Mahomes*+':'Patrick Mahomes' })
#Select informations about players and ordering
final_data = mid_data[['Player', 'Tm'] + cols]
final_data.sort_values(by = 'Player', ascending=True)
final_data.drop_duplicates(subset = 'Player')
What i want with that code is that my df final_data returns me first season of each player, but that dont work with some players that i needed use replace method.
Where i write to sort_value that's my result, before drop.duplicates()
My idea was sort these values, then use drop.duplicates() to select just first of each player.
This happen with all players that i needed use replace method. How fix this ?
