This project presents a simple yet insightful streamlit application that processes and presents analysed soccer player data.
Presented with raw data, the challenge was to wrangle and process the data to make it simpler to understand for technical soccer audiences..
The data used in this project is from wyscout and the tournament is the CAF Champions League 2022/2023, Africa’s top club competition. The data was cleaned and processed using jupyter notebooks. Some methodologies included:
Data Cleaning to filter players to meet a criteria/threshold on minutes played
Feature engineering to create new variables like xG/Shot
Modelling was done by standardising numerical variables to a value between 0 and 1
More modelling was also done by weighting different features to create composite score attributes like passing, dribbling, ball playing, creativity, among defence among others
You can visit the app page where it is hosted to see it in action, or visit the github repository to see its code:
The app site is below.
A Tableau Dashboard for this project is below
The github repository is below
The code to process the raw wyscout data and creates a csv to be used by the app is as below
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Determine the directory of the notebook file
path_dir = os.path.dirname(os.path.abspath('__file__'))
#Check files of the directory
files = os.listdir(path_dir)
files
path_dir
#Select files for outfield plaers(excludes GKs)
files = ['Search results (1).xlsx', 'Search results (2).xlsx', 'Search results.xlsx']
#Initialize Empty List to store indivivual Datfarames
appended_data = []
for f in files:
data = pd.read_excel(f,)
appended_data.append(data)
player_data = pd.concat(appended_data)
#Set display options
pd.set_option('display.max_columns', None)
player_data
def process_wyscout_data(df):
# Filter data
threshold = df[(df['Matches played'] >= 3) & (df['Minutes played'] > 270)]
## FEATURE ENGINEERING
# Create goal contributions column
threshold['g+a'] = threshold['Goals'] + threshold['Assists']
threshold['xG per Shot'] = threshold['xG'] / threshold['Shots']
# Replace NaN values in 'xG per Shot' with zeros
threshold['xG per Shot'] = threshold['xG per Shot'].fillna(0)
#Columns to subset
desired_cols = ['Player', 'Team', 'Age', 'Birth country', 'Passport country', 'Foot', 'Height', 'Weight', 'Position',
'Position Group', 'Matches played', 'Minutes played', 'Goals', 'xG', 'Assists', 'xA', 'g+a',
'Duels per 90', 'Duels won, %', 'Successful defensive actions per 90', 'Defensive duels per 90',
'Defensive duels won, %', 'Aerial duels per 90', 'Aerial duels won, %', 'Sliding tackles per 90',
'PAdj Sliding tackles', 'Shots blocked per 90', 'Interceptions per 90', 'PAdj Interceptions',
'Fouls per 90', 'Yellow cards', 'Yellow cards per 90', 'Red cards', 'Red cards per 90',
'Successful attacking actions per 90', 'Goals per 90', 'Non-penalty goals', 'Non-penalty goals per 90',
'xG per 90', 'Shots', 'Shots per 90', 'Shots on target, %', 'xG per Shot', 'Goal conversion, %', 'Assists per 90',
'Crosses per 90', 'Accurate crosses, %', 'Dribbles per 90', 'Successful dribbles, %',
'Offensive duels per 90', 'Offensive duels won, %', 'Touches in box per 90',
'Progressive runs per 90', 'Received passes per 90', 'Received long passes per 90', 'Fouls suffered per 90',
'Passes per 90', 'Accurate passes, %', 'Forward passes per 90', 'Accurate forward passes, %',
'Back passes per 90', 'Accurate back passes, %', 'Lateral passes per 90', 'Accurate lateral passes, %',
'Short / medium passes per 90', 'Accurate short / medium passes, %', 'Long passes per 90',
'Accurate long passes, %', 'Average pass length, m', 'Average long pass length, m', 'xA per 90',
'Shot assists per 90', 'Second assists per 90', 'Third assists per 90', 'Smart passes per 90',
'Accurate smart passes, %', 'Key passes per 90', 'Passes to final third per 90',
'Accurate passes to final third, %', 'Passes to penalty area per 90', 'Accurate passes to penalty area, %',
'Through passes per 90', 'Accurate through passes, %', 'Deep completions per 90',
'Deep completed crosses per 90', 'Progressive passes per 90', 'Accurate progressive passes, %']
#Columns to standardize
std_cols = ['xG', 'xA','Duels per 90', 'Duels won, %', 'Successful defensive actions per 90', 'Defensive duels per 90',
'Defensive duels won, %', 'Aerial duels per 90', 'Aerial duels won, %', 'Sliding tackles per 90',
'PAdj Sliding tackles', 'Shots blocked per 90', 'Interceptions per 90', 'PAdj Interceptions',
'Fouls per 90', 'Yellow cards', 'Yellow cards per 90', 'Red cards', 'Red cards per 90',
'Successful attacking actions per 90', 'Goals per 90', 'Non-penalty goals', 'Non-penalty goals per 90',
'xG per 90', 'Shots', 'Shots per 90', 'Shots on target, %', 'xG per Shot', 'Goal conversion, %', 'Assists per 90',
'Crosses per 90', 'Accurate crosses, %', 'Dribbles per 90', 'Successful dribbles, %', 'Offensive duels per 90',
'Offensive duels won, %', 'Touches in box per 90', 'Progressive runs per 90', 'Received passes per 90',
'Received long passes per 90', 'Fouls suffered per 90', 'Passes per 90', 'Accurate passes, %',
'Forward passes per 90', 'Accurate forward passes, %', 'Back passes per 90', 'Accurate back passes, %',
'Lateral passes per 90', 'Accurate lateral passes, %', 'Short / medium passes per 90',
'Accurate short / medium passes, %', 'Long passes per 90', 'Accurate long passes, %', 'Average pass length, m',
'Average long pass length, m', 'xA per 90', 'Shot assists per 90', 'Second assists per 90',
'Third assists per 90', 'Smart passes per 90', 'Accurate smart passes, %', 'Key passes per 90',
'Passes to final third per 90', 'Accurate passes to final third, %', 'Passes to penalty area per 90',
'Accurate passes to penalty area, %', 'Through passes per 90', 'Accurate through passes, %',
'Deep completions per 90', 'Deep completed crosses per 90', 'Progressive passes per 90',
'Accurate progressive passes, %']
# List of position groups and their associated position keywords
position_to_group = {
'RW': 'W10', 'LW': 'W10', 'RAMF': 'W10', 'LAMF': 'W10', 'RWF': 'W10', 'LWF': 'W10', 'AMF': 'W10',
'CB': 'CB', 'RCB': 'CB', 'LCB': 'CB',
'RB': 'FB', 'LB': 'FB',
'RWB': 'WB', 'LWB': 'WB',
'LDMF': 'DM', 'DMF': 'DM', 'RDMF': 'DM', 'LCMF': 'CM', 'RCMF': 'CM', 'CMF': 'CM',
'CF': 'CF'
}
# Loop through each row and populate positional Dataframes
new_rows = []
for index, row in threshold.iterrows():
positions = row['Position'].split(', ') # Split positions string into a list
for position in positions:
position_group = position_to_group.get(position)
if position_group:
new_row = row.copy()
new_row['Position Group'] = position_group
new_rows.append(new_row)
new_df = pd.DataFrame(new_rows)
#Select desired columns and drop duplicates
new_df = new_df[desired_cols].drop_duplicates()
# Scale the specified columns between 0 and 1
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(new_df[std_cols])
# Create a DataFrame with the scaled data and column names
scaled_df = pd.DataFrame(scaled_data, columns=[col + '_scaled' for col in std_cols])
# Reset the index of new_df before concatenation
new_df.reset_index(drop=True, inplace=True)
# Concatenate the scaled DataFrame with the original DataFrame
new_df = pd.concat([new_df, scaled_df], axis=1)
# Calculate percentile ranks based on position groups and store in columns with _pr suffix
position_groups = new_df['Position Group'].unique()
for group in position_groups:
group_mask = new_df['Position Group'] == group
for col in std_cols:
col_name_pr = col + '_pr'
new_df.loc[group_mask, col_name_pr] = new_df.loc[group_mask, col + '_scaled'].rank(pct=True)
scaled_weights = {
"Passing_Score": {
"Accurate passes, %_scaled": 0.7,
"Accurate forward passes, %_scaled": 0.15,
"Accurate long passes, %_scaled": 0.15
},
"Dribbling_Score": {
"Successful dribbles, %_scaled": 0.7,
"Dribbles per 90_scaled": 0.3
},
"Scoring_Score": {
"xG_scaled": 0.5,
"xG per Shot_scaled": 0.1,
"Non-penalty goals_scaled": 0.1,
"Shots_scaled": 0.3
},
"ChanceCreation_Score": {
"Key passes per 90_scaled": 0.2,
"xA_scaled": 0.2,
"Accurate passes to final third, %_scaled": 0.25,
"Accurate crosses, %_scaled": 0.25,
"Successful attacking actions per 90_scaled": 0.1
},
"Duels_Score": {
"Aerial duels won, %_scaled": 0.3,
"Aerial duels per 90_scaled": 0.2,
"Defensive duels won, %_scaled": 0.3,
"Defensive duels per 90_scaled": 0.2
},
"Defense_Score": {
"Successful defensive actions per 90_scaled": 0.4,
"PAdj Sliding tackles_scaled": 0.2,
"PAdj Interceptions_scaled": 0.2,
"Shots blocked per 90_scaled": 0.2
}
}
for score_name, score_weights in scaled_weights.items():
for col, weight in score_weights.items():
weighted_col_name = col + '_weights'
new_df[weighted_col_name] = new_df[col] * weight
new_df['Passing'] = new_df['Accurate passes, %_scaled_weights'] + new_df['Accurate forward passes, %_scaled_weights'] + new_df['Accurate long passes, %_scaled_weights']
new_df['Dribbling'] = new_df['Successful dribbles, %_scaled_weights'] + new_df['Dribbles per 90_scaled_weights']
new_df['Scoring'] = new_df['xG_scaled_weights'] + new_df['xG per Shot_scaled_weights'] + new_df['Non-penalty goals_scaled_weights'] + new_df['Shots_scaled_weights']
new_df['Creativity'] = new_df['Key passes per 90_scaled_weights'] + new_df['xA_scaled_weights'] + new_df['Accurate passes to final third, %_scaled_weights'] + new_df['Accurate crosses, %_scaled_weights'] + new_df['Successful attacking actions per 90_scaled_weights']
new_df['Duels'] = new_df['Aerial duels won, %_scaled_weights'] + new_df['Aerial duels per 90_scaled_weights'] + new_df['Defensive duels won, %_scaled_weights'] + new_df['Defensive duels per 90_scaled_weights']
new_df['Defense'] = new_df['Successful defensive actions per 90_scaled_weights'] + new_df['PAdj Sliding tackles_scaled_weights'] + new_df['PAdj Interceptions_scaled_weights'] + new_df['Shots blocked per 90_scaled_weights']
new_df['Passing'] = new_df['Passing'] * 100
new_df['Dribbling'] = new_df['Dribbling'] * 100
new_df['Scoring'] = new_df['Scoring'] * 100
new_df['Creativity'] = new_df['Creativity'] * 100
new_df['Duels'] = new_df['Duels'] * 100
new_df['Defense'] = new_df['Defense'] * 100
bp_weights = {
"Ball Playing": {
"Passing": 0.5,
"Dribbling": 0.5
}
}
ip_weights = {
"In Possession": {
"Creativity": 0.4,
"Scoring": 0.2,
"Ball Playing": 0.4
}
}
op_weights = {
"Out Of Possession": {
"Duels": 0.6,
"Defense": 0.4
}
}
score_weights = {
"Overall Score": {
"In Possession": 0.5,
"Out Of Possession": 0.5
}
}
# Initialize the additional columns for sequential processing
new_df['Ball Playing'] = 0
new_df['In Possession'] = 0
new_df['Out Of Possession'] = 0
new_df['Overall Score'] = 0
# Create Ball Playing score
for bp_col, bp_weight in bp_weights['Ball Playing'].items():
new_df['Ball Playing'] += new_df[bp_col] * bp_weight
# Create In Possession score
for ip_col, ip_weight in ip_weights['In Possession'].items():
new_df['In Possession'] += new_df[ip_col] * ip_weight
# Create Out Of Possession score
for op_col, op_weight in op_weights['Out Of Possession'].items():
new_df['Out Of Possession'] += new_df[op_col] * op_weight
# Create Overall score
for score_col, score_weight in score_weights['Overall Score'].items():
new_df['Overall Score'] += new_df[score_col] * score_weight
#Calculate Overall Score Percentile Rank
position_groups = new_df['Position Group'].unique()
for group in position_groups:
group_mask = new_df['Position Group'] == group
overall_score_col = 'Overall Score'
overall_score_pr_col = overall_score_col + '_pr'
new_df.loc[group_mask, overall_score_pr_col] = new_df.loc[group_mask, overall_score_col].rank(pct=True)
new_df['Overall Score_pr'] = new_df['Overall Score_pr'] * 100
return new_df
dd = process_wyscout_data(player_data)
dd
dd.to_csv('ccl_final.csv', index=False)