In [59]:
from google.colab import drive
drive.mount("/content/drive")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Imports¶
In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
In [61]:
import os
data_path="/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files"
season_csv_list=os.listdir(data_path)
#file path for each csv
season_files=[os.path.join(data_path,s) for s in season_csv_list]
print(season_files)
['/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2024.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2023.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2022.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2021.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2020.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2019.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2018.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2017.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2016.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2015.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2014.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2013.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2012.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2011.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2010.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2009.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2008.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2007.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2006.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2005.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2004.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2003.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2002.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2001.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2000.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2025.csv']
Functions calculating Teams league position based on wins and goals scored¶
In [62]:
def prepare_league_table_stats(df:pd.DataFrame)->pd.DataFrame:
table:Dict[str,Dict[str,int]]=defaultdict(lambda:{
"team":0,
"points":0,
"goals_for":0,
"goals_against":0,
"wins":0,
"losses":0,
"draws":0
})
for _,row in df.iterrows():
home,away=row["HomeTeam"],row["AwayTeam"]
hg,ag=row["FTHG"],row["FTAG"]
table[home]["team"]=home
table[away]["team"]=away
table[home]["goals_for"]+=hg
table[home]["goals_against"]+=ag
table[away]["goals_for"]+=ag
table[away]["goals_against"]+=hg
if hg>ag:
table[home]["points"]+=3
table[home]["wins"]+=1
table[away]["losses"]+=1
elif ag>hg:
table[away]["points"]+=3
table[away]["wins"]+=1
table[home]["losses"]+=1
else:
table[home]["points"]+=1
table[away]["points"]+=1
table[home]["draws"]+=1
table[away]["draws"]+=1
table_df=pd.DataFrame(table).T
table_df["goal_diff"]=table_df["goals_for"]-table_df["goals_against"]
table_df.sort_values(["points","wins","goals_for"],ascending=[False,False,False],inplace=True)
table_df=table_df.reset_index(drop=True)
table_df["position"]=table_df.index+1
return table_df
In [63]:
def add_league_stats_to_matches(df:pd.DataFrame)->pd.DataFrame:
"""
Parameter:Season matches stats
Function:Matches data frame will be used to compute league table stats for each teams.The stats will be merged to corresponding team matches in h2h df
Return:Matches stats with teams league positions,points,goals difference for both home and away team
"""
#preparing league positions stats
table_df=prepare_league_table_stats(df)
#merging league stats on h2h df
df = df.drop(columns=[col for col in df.columns if col.startswith(('h_', 'a_'))], errors='ignore')
merged_df = df.merge(
table_df.add_prefix("h_"),
how='left',
left_on='HomeTeam',
right_on='h_team'
).merge(
table_df.add_prefix("a_"),
how='left',
left_on='AwayTeam',
right_on='a_team'
)
return merged_df
Utility functions¶
In [64]:
def parse_date_df(df: pd.DataFrame) -> pd.DataFrame:
from pandas.api.types import is_datetime64_any_dtype
if not is_datetime64_any_dtype(df['Date']):
try:
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
except Exception:
# Fallback format
df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%y", errors='coerce')
# Optional: drop rows with invalid dates
df = df.dropna(subset=['Date'])
return df
In [65]:
def get_season(df: pd.DataFrame) -> pd.DataFrame:
"""
Assign Premier League season based on date.
Season starts in August and ends next year.
"""
years = df['Date'].dt.year
df['season'] = (years - (df['Date'].dt.month < 8)).astype(str) + '-' + (years - (df['Date'].dt.month < 8) + 1).astype(str)
return df
In [66]:
def get_last5_stats(df:pd.DataFrame,team:str):
team_matches5=df[(df["HomeTeam"]==team)|(df["AwayTeam"]==team)].sort_values(['Date'],ascending=False).head()
avg_goals_for_last5=( (team_matches5['HomeTeam']==team) * team_matches5['FTHG'] +
(team_matches5['AwayTeam']==team) * team_matches5['FTAG'] ).mean()
avg_goals_against_last5=((team_matches5['HomeTeam']==team) * team_matches5['FTAG']+
(team_matches5['AwayTeam']==team) * team_matches5['FTHG']).mean()
avg_corners_last5=((team_matches5['HomeTeam']==team)* team_matches5['HC']+
(team_matches5['AwayTeam']==team)* team_matches5['AC']).mean()
avg_redcards_last5=((team_matches5['HomeTeam']==team) * team_matches5['HR']+
(team_matches5['AwayTeam']==team) * team_matches5['AR']).mean()
# win rate in last 5 matches
wins=((team_matches5['HomeTeam']==team) & (team_matches5['FTR']=='H') |
(team_matches5['AwayTeam']==team)& (team_matches5['FTR']=='A')).sum()
win_rate=wins/len(team_matches5)
draws=((team_matches5["HomeTeam"]==team) & (team_matches5['FTR']=='D') |
(team_matches5["AwayTeam"]==team) & (team_matches5['FTR']=='D')).sum()
draw_rate=draws/len(team_matches5)
return{
"avg_goals_for_last5":avg_goals_for_last5,
"avg_goals_against_last5":avg_goals_against_last5,
"avg_corners_last5":avg_corners_last5,
"avg_redcards_last5":avg_redcards_last5,
"win_rate_last5":win_rate,
"draw_rate_last5":draw_rate
}
In [67]:
def add_last5_features(fixtures:pd.DataFrame)->pd.DataFrame:
features=[]
for _,row in fixtures.iterrows():
home_stats_last5=get_last5_stats(fixtures,row['HomeTeam'])
away_stats_last5=get_last5_stats(fixtures,row['AwayTeam'])
features.append({
**row,
**{f"h_{k}": v for k,v in home_stats_last5.items()},
**{f"a_{k}":v for k, v in away_stats_last5.items()},
})
return pd.DataFrame(features)
In [68]:
def calculate_h2h_win_rates(df: pd.DataFrame) -> pd.DataFrame:
"""Calculate head-to-head win rates for each match."""
h2h: Dict = {}
home_rates, away_rates,draw_rates = [], [], []
for _, row in df.iterrows():
home, away = row["HomeTeam"], row["AwayTeam"]
key = tuple(sorted([home, away]))
stats = h2h.get(key, {"team1": 0, "team2": 0,"draws":0, "matches": 0})
total = stats["matches"]
# Compute current H2H rates
if total == 0:
home_rate = away_rate = 0.5
draw_rate=0.33
else:
if home == key[0]:
home_rate = stats["team1"] / total
away_rate = stats["team2"] / total
else:
home_rate = stats["team2"] / total
away_rate = stats["team1"] / total
draw_rate=stats["draws"]/total
home_rates.append(home_rate)
away_rates.append(away_rate)
draw_rates.append(draw_rate)
# Update stats with current match result
stats["matches"]+=1
if row["FTR"]=='H':
stats["team1" if home==key[0] else "team2"]+=1
elif row["FTR"]=='A':
stats["team2" if home==key[0] else "team1"]+=1
else:
stats["draws"]+=1
h2h[key] = stats
# Assign to dataframe
df["home_vs_away_winrate"] = home_rates
df["away_vs_home_winrate"] = away_rates
df["h2h_draw_rate"]=draw_rates
return df
Preparing Training dataset¶
In [69]:
#preparing dataset
season_fixtures:Dict[str,pd.DataFrame]={}
for file_path in season_files:
raw = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
# parsing date and sorting them in desending order
raw=parse_date_df(raw)
raw=raw.sort_values(["Date"],ascending=False).reset_index(drop=True)
raw=get_season(raw)
for season_name, group in raw.groupby('season'):
season_fixtures[season_name] = group
group=add_league_stats_to_matches(group)
season_fixtures[season_name] = add_last5_features(group)
In [70]:
season_df = pd.concat(season_fixtures.values(), ignore_index=True)
# Shot efficiency
h_shot_eff = season_df["HST"].div(season_df["HS"]).fillna(0)
a_shot_eff = season_df["AST"].div(season_df["AS"]).fillna(0)
season_df["home_shot_efficiency"] = h_shot_eff
season_df["away_shot_efficiency"] = a_shot_eff
# Gap features
season_df["goal_diff_gap"] = season_df["h_goal_diff"] - season_df["a_goal_diff"]
season_df["points_gap"] = season_df["h_points"] - season_df["a_points"]
season_df["form_gap"] = season_df["HomeTeam_Form"] - season_df["AwayTeam_Form"]
season_df["win_rate_gap"] = season_df["h_win_rate_last5"] - season_df["a_win_rate_last5"]
season_df["attack_strength_gap"] = season_df["h_avg_goals_for_last5"] - season_df["a_avg_goals_for_last5"]
season_df["defense_strength_gap"] = season_df["a_avg_goals_against_last5"] - season_df["h_avg_goals_against_last5"]
season_df["league_position_gap"] = season_df["h_position"] - season_df["a_position"]
# Head-to-head win rates
season_df = calculate_h2h_win_rates(season_df)
# =====feature engineering from odd scores of B365 agency====
season_df["home_away_win_ratio"]=season_df["B365H"]/season_df["B365A"]
season_df["draw_odd_ratio"]=season_df["B365D"]/season_df[["B365H","B365A"]].min(axis=1)
# calculating implied probablity of bet odd
season_df["h_implied_prob"]=1/season_df["B365H"]
season_df["a_implied_prob"]=1/season_df["B365D"]
season_df["d_implied_prob"]=1/season_df["B365A"]
season_df["betting_confidence"]=season_df["h_implied_prob"]-season_df["a_implied_prob"]
"""
favorite strength-> Strong team probabilty to win
underdog strength-> Weak team probabilty to win
"""
season_df["favorite_strength"]=season_df[["h_implied_prob","a_implied_prob"]].min(axis=1)
season_df["underdog_strength"]=season_df[["h_implied_prob","a_implied_prob"]].max(axis=1)
#calculating odd variance from different book markers
home_odds_cols = [
"B365H", "BWH", "BFH", "PSH", "WHH", "1XBH", "MaxH", "AvgH", "BFEH"
]
away_odds_cols = [
"B365A", "BWA", "BFA", "PSA", "WHA", "1XBA", "MaxA", "AvgA", "BFEA"
]
#==========last 5 matches ===================
season_df['home_draw_rate_last5'] = season_df.groupby('HomeTeam')['FTR'].apply(
lambda x: (x.shift()=='D').rolling(5, min_periods=1).mean()
).reset_index(level=0, drop=True)
season_df['away_draw_rate_last5'] = season_df.groupby('AwayTeam')['FTR'].apply(
lambda x: (x.shift()=='D').rolling(5, min_periods=1).mean()
).reset_index(level=0, drop=True)
season_df['draw_rate_gap'] = season_df['home_draw_rate_last5'] - season_df['away_draw_rate_last5']
season_df['home_low_score_rate'] = season_df.groupby('HomeTeam')['FTHG'].apply(lambda x: (x.shift()<=1).rolling(5,min_periods=1).mean()).reset_index(level=0, drop=True)
season_df['away_low_score_rate'] = season_df.groupby('AwayTeam')['FTAG'].apply(lambda x: (x.shift()<=1).rolling(5,min_periods=1).mean()).reset_index(level=0, drop=True)
season_df['low_score_gap'] = season_df['home_low_score_rate'] - season_df['away_low_score_rate']
season_df["home_odd_variance"]=season_df[home_odds_cols].var(axis=1)
season_df["away_odd_variance"]=season_df[away_odds_cols].var(axis=1)
#=============================================================
season_df.sort_values(["season", "Date"], ascending=False, inplace=True)
# Updated selected columns
selected_cols = [
"HomeTeam", "AwayTeam",
"FTHG", "FTAG",
# Head to head win rates
"home_vs_away_winrate",
"away_vs_home_winrate",
# Gap features
"goal_diff_gap",
"points_gap",
"form_gap",
"win_rate_gap",
"attack_strength_gap",
"defense_strength_gap",
"league_position_gap",
# Match performance stats
"home_shot_efficiency",
"away_shot_efficiency",
# Home advantage
"HomeTeam_Form",
"AwayTeam_Form",
"home_away_win_ratio",
"draw_odd_ratio",
"h_implied_prob",
"a_implied_prob",
"d_implied_prob",
"home_odd_variance",
"away_odd_variance",
"betting_confidence",
"favorite_strength",
"underdog_strength",
"home_draw_rate_last5",
"away_draw_rate_last5",
"draw_rate_gap",
"home_low_score_rate",
"away_low_score_rate",
"low_score_gap",
"FTR"
]
store_df = season_df.copy()
store_df.replace([-np.inf,np.inf],np.nan,inplace=True)
store_df.fillna(0.5,inplace=True)
season_df = season_df[selected_cols]
season_df.reset_index(drop=True, inplace=True)
# Drop rows with missing critical data
season_df.dropna(subset=["HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"], inplace=True)
season_df.head()
Out[70]:
| HomeTeam | AwayTeam | FTHG | FTAG | home_vs_away_winrate | away_vs_home_winrate | goal_diff_gap | points_gap | form_gap | win_rate_gap | ... | betting_confidence | favorite_strength | underdog_strength | home_draw_rate_last5 | away_draw_rate_last5 | draw_rate_gap | home_low_score_rate | away_low_score_rate | low_score_gap | FTR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Brentford | Man City | 0.0 | 1.0 | 0.250000 | 0.625000 | -12.0 | -6 | 0.0 | -0.4 | ... | -0.011696 | 0.210526 | 0.222222 | 0.2 | 0.2 | 0.0 | 0.6 | 0.4 | 0.2 | A |
| 1 | Wolves | Brighton | 1.0 | 1.0 | 0.142857 | 0.428571 | -9.0 | -7 | 0.0 | -0.4 | ... | 0.007508 | 0.270270 | 0.277778 | 0.2 | 0.2 | 0.0 | 0.8 | 0.8 | 0.0 | D |
| 2 | Newcastle | Nott'm Forest | 2.0 | 0.0 | 0.833333 | 0.166667 | 8.0 | 4 | 0.0 | 0.4 | ... | 0.398847 | 0.238095 | 0.636943 | 0.2 | 0.2 | 0.0 | 0.6 | 1.0 | -0.4 | H |
| 3 | Everton | Crystal Palace | 2.0 | 1.0 | 0.500000 | 0.115385 | -2.0 | -1 | 0.0 | -0.2 | ... | 0.100471 | 0.307692 | 0.408163 | 0.4 | 0.4 | 0.0 | 0.6 | 1.0 | -0.4 | H |
| 4 | Aston Villa | Burnley | 2.0 | 1.0 | 0.416667 | 0.166667 | 7.0 | 5 | -0.4 | 0.4 | ... | 0.375000 | 0.250000 | 0.625000 | 0.4 | 0.0 | 0.4 | 0.4 | 0.8 | -0.4 | H |
5 rows × 34 columns
EDA¶
In [90]:
numeric_cols=season_df.select_dtypes(include=["number"])
plt.figure(figsize=(20,11))
sns.heatmap(numeric_cols.corr(),annot=True,cmap='coolwarm')
Out[90]:
<Axes: >
Distribution of match outcome¶
In [72]:
sns.countplot(x=season_df['FTR'])
Out[72]:
<Axes: xlabel='FTR', ylabel='count'>
Dataset is biased on home wins
Feature Selection¶
In [73]:
# selecting features for Training and Test sets
features = [
# head to head win rates
"home_vs_away_winrate",
"away_vs_home_winrate",
# Match performance stats
"home_shot_efficiency",
"away_shot_efficiency",
"goal_diff_gap",
"points_gap",
"form_gap",
"win_rate_gap",
"attack_strength_gap",
"defense_strength_gap",
"league_position_gap",
"home_away_win_ratio",
"draw_odd_ratio",
"h_implied_prob",
"a_implied_prob",
"d_implied_prob",
"home_odd_variance",
"away_odd_variance",
"betting_confidence",
"favorite_strength",
"underdog_strength",
"home_draw_rate_last5",
"away_draw_rate_last5",
"draw_rate_gap",
"home_low_score_rate",
"away_low_score_rate",
"low_score_gap"
]
target='FTR'
X=season_df[features]
X = X[features].fillna(0)
y=season_df[target]
outcome_to_int = {
"H": 0,
"D": 1,
"A": 2
}
y=y.map(outcome_to_int)
scaler=StandardScaler()
X=X.replace([np.inf,-np.inf],np.nan)
X=X.fillna(0.5)
#scaling X
X_scaled=scaler.fit_transform(X)
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,
stratify=y)
Random Forest Classifier¶
In [74]:
rf=RandomForestClassifier( n_estimators=300, # more trees
max_depth=12, # limit tree depth
min_samples_split=5, # prevent splits with too few samples
min_samples_leaf=2, # leaf must have at least 2 samples
max_features='sqrt', # sqrt(num_features) for split
class_weight="balanced", # handle class imbalance
random_state=42)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 0.5922995780590717
precision recall f1-score support
0 0.69 0.74 0.71 869
1 0.36 0.26 0.30 467
2 0.58 0.64 0.61 560
accuracy 0.59 1896
macro avg 0.54 0.55 0.54 1896
weighted avg 0.57 0.59 0.58 1896
In [75]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
cm = confusion_matrix(y_test, y_pred)
cm
Out[75]:
array([[644, 123, 102],
[188, 121, 158],
[107, 95, 358]])
In [76]:
import numpy as np
classes = ["Home", "Draw", "Away"]
def predict_match_rf(df, model, home_team, away_team, features,alpha=0.7):
"""
Predict the outcome of a single match using a trained Random Forest.
Parameters:
df (pd.DataFrame): Dataset with match features.
model: Trained RandomForestClassifier.
home_team (str): Name of the home team.
away_team (str): Name of the away team.
features (list): List of feature column names.
draw_threshold (float): Minimum probability to predict Draw.
Returns:
predicted_class (int or None): 0=Home, 1=Draw, 2=Away
predicted_probs (np.array or None): probability array [Home, Draw, Away]
"""
# Select the match row
match_row = df[(df['HomeTeam'] == home_team) & (df['AwayTeam'] == away_team)]
if match_row.empty:
return None, None
X_match = match_row[features]
# Predict probabilities
y_pred_proba = model.predict_proba(X_match)[0]
implied_prob_home=match_row["h_implied_prob"].values[0]
implied_prob_draw=match_row["d_implied_prob"].values[0]
implied_prob_away=match_row["a_implied_prob"].values[0]
final_home_prob=y_pred_proba[0]*alpha+(1-alpha)*implied_prob_home
final_draw_prob=y_pred_proba[1]*alpha+(1-alpha)*implied_prob_draw
final_away_prob=y_pred_proba[2]*alpha+(1-alpha)*implied_prob_away
total=final_home_prob+final_draw_prob+final_away_prob
final_y_proba=np.array([final_home_prob,final_draw_prob,final_away_prob])/total
return final_y_proba
# ==================================
# Example usage: loop through fixtures
# ==================================
fixtures = [
("Nott'm Forest", "Chelsea"),
("Brighton", "Newcastle"),
("Burnley", "Leeds"),
("Crystal Palace", "Bournemouth"),
("Man City", "Everton"),
("Sunderland", "Wolves"),
("Fulham", "Arsenal"),
("Tottenham", "Aston Villa"),
("Liverpool", "Man United"),
("West Ham", "Brentford"),
]
for home_team, away_team in fixtures:
pred_probs = predict_match_rf(store_df, rf, home_team, away_team, features)
pred_class=pred_probs.argmax()
if pred_class is not None:
print(f"{home_team} vs {away_team}")
print(" Home ", f"{pred_probs[0]*100:.2f}%")
print(" Draw ", f"{pred_probs[1]*100:.2f}%")
print(" Away ", f"{pred_probs[2]*100:.2f}%")
print("-" * 40)
Nott'm Forest vs Chelsea Home 20.28% Draw 34.20% Away 45.52% ---------------------------------------- Brighton vs Newcastle Home 24.35% Draw 53.20% Away 22.45% ---------------------------------------- Burnley vs Leeds Home 27.63% Draw 54.39% Away 17.98% ---------------------------------------- Crystal Palace vs Bournemouth Home 23.42% Draw 53.72% Away 22.86% ---------------------------------------- Man City vs Everton Home 45.25% Draw 40.90% Away 13.85% ---------------------------------------- Sunderland vs Wolves Home 42.66% Draw 42.12% Away 15.22% ---------------------------------------- Fulham vs Arsenal Home 16.70% Draw 62.32% Away 20.98% ---------------------------------------- Tottenham vs Aston Villa Home 36.39% Draw 31.74% Away 31.87% ---------------------------------------- Liverpool vs Man United Home 55.57% Draw 35.36% Away 9.07% ---------------------------------------- West Ham vs Brentford Home 21.15% Draw 30.79% Away 48.06% ----------------------------------------
Neural network¶
In [77]:
from tensorflow import keras
from tensorflow.keras import layers
model=keras.Sequential([
layers.Dense(128,activation='relu',input_shape=(X_train.shape[1],)),
layers.Dropout(0.3),
layers.Dense(64,activation='relu'),
layers.Dropout(0.3),
layers.Dense(32,activation='relu'),
layers.Dense(3,activation='softmax')
])
model.compile(optimizer='adam',metrics=['accuracy'],loss='sparse_categorical_crossentropy')
In [78]:
from sklearn.utils.class_weight import compute_class_weight
class_weights=compute_class_weight('balanced',classes=np.unique(y_train),y=y_train)
class_weights=dict(enumerate(class_weights))
history=model.fit(X_train,y_train,
batch_size=32,epochs=60,
validation_split=0.2,class_weight=class_weights)
Epoch 1/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 4ms/step - accuracy: 0.4141 - loss: 1.6111 - val_accuracy: 0.5201 - val_loss: 0.9689 Epoch 2/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4887 - loss: 1.0220 - val_accuracy: 0.5188 - val_loss: 0.9921 Epoch 3/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4925 - loss: 1.0147 - val_accuracy: 0.5122 - val_loss: 1.0036 Epoch 4/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5131 - loss: 1.0021 - val_accuracy: 0.5069 - val_loss: 0.9963 Epoch 5/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5309 - loss: 0.9876 - val_accuracy: 0.4819 - val_loss: 1.0162 Epoch 6/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5128 - loss: 0.9928 - val_accuracy: 0.5049 - val_loss: 0.9804 Epoch 7/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5139 - loss: 0.9967 - val_accuracy: 0.5016 - val_loss: 0.9896 Epoch 8/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5216 - loss: 0.9859 - val_accuracy: 0.4957 - val_loss: 1.0072 Epoch 9/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5194 - loss: 0.9877 - val_accuracy: 0.5129 - val_loss: 0.9728 Epoch 10/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5100 - loss: 0.9995 - val_accuracy: 0.5115 - val_loss: 0.9666 Epoch 11/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5164 - loss: 0.9972 - val_accuracy: 0.5201 - val_loss: 0.9587 Epoch 12/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5232 - loss: 0.9890 - val_accuracy: 0.5082 - val_loss: 0.9907 Epoch 13/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5152 - loss: 0.9866 - val_accuracy: 0.5221 - val_loss: 0.9571 Epoch 14/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5302 - loss: 0.9760 - val_accuracy: 0.5221 - val_loss: 0.9719 Epoch 15/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5322 - loss: 0.9743 - val_accuracy: 0.5115 - val_loss: 0.9797 Epoch 16/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5221 - loss: 0.9716 - val_accuracy: 0.5096 - val_loss: 0.9716 Epoch 17/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5240 - loss: 0.9743 - val_accuracy: 0.5115 - val_loss: 0.9778 Epoch 18/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5289 - loss: 0.9737 - val_accuracy: 0.5142 - val_loss: 0.9725 Epoch 19/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5405 - loss: 0.9687 - val_accuracy: 0.5274 - val_loss: 0.9704 Epoch 20/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5321 - loss: 0.9807 - val_accuracy: 0.5214 - val_loss: 0.9836 Epoch 21/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5345 - loss: 0.9760 - val_accuracy: 0.5194 - val_loss: 0.9728 Epoch 22/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5281 - loss: 0.9780 - val_accuracy: 0.5254 - val_loss: 0.9826 Epoch 23/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5439 - loss: 0.9677 - val_accuracy: 0.5379 - val_loss: 0.9744 Epoch 24/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5383 - loss: 0.9714 - val_accuracy: 0.5320 - val_loss: 0.9686 Epoch 25/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5241 - loss: 0.9718 - val_accuracy: 0.5339 - val_loss: 0.9635 Epoch 26/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5329 - loss: 0.9708 - val_accuracy: 0.5412 - val_loss: 0.9515 Epoch 27/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5500 - loss: 0.9530 - val_accuracy: 0.5478 - val_loss: 0.9592 Epoch 28/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5733 - loss: 0.9509 - val_accuracy: 0.5419 - val_loss: 0.9571 Epoch 29/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5494 - loss: 0.9562 - val_accuracy: 0.5485 - val_loss: 0.9588 Epoch 30/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5548 - loss: 0.9534 - val_accuracy: 0.5630 - val_loss: 0.9591 Epoch 31/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5587 - loss: 0.9460 - val_accuracy: 0.5623 - val_loss: 0.9357 Epoch 32/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5723 - loss: 0.9398 - val_accuracy: 0.5590 - val_loss: 0.9323 Epoch 33/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5788 - loss: 0.9412 - val_accuracy: 0.5583 - val_loss: 0.9435 Epoch 34/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5719 - loss: 0.9378 - val_accuracy: 0.5577 - val_loss: 0.9499 Epoch 35/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.5663 - loss: 0.9445 - val_accuracy: 0.5498 - val_loss: 0.9454 Epoch 36/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5738 - loss: 0.9407 - val_accuracy: 0.5471 - val_loss: 0.9415 Epoch 37/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5575 - loss: 0.9400 - val_accuracy: 0.5386 - val_loss: 0.9666 Epoch 38/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5685 - loss: 0.9449 - val_accuracy: 0.5557 - val_loss: 0.9381 Epoch 39/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5646 - loss: 0.9546 - val_accuracy: 0.5550 - val_loss: 0.9387 Epoch 40/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5690 - loss: 0.9244 - val_accuracy: 0.5557 - val_loss: 0.9431 Epoch 41/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - accuracy: 0.5730 - loss: 0.9285 - val_accuracy: 0.5610 - val_loss: 0.9316 Epoch 42/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5670 - loss: 0.9315 - val_accuracy: 0.5544 - val_loss: 0.9326 Epoch 43/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.5634 - loss: 0.9394 - val_accuracy: 0.5452 - val_loss: 0.9539 Epoch 44/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5650 - loss: 0.9390 - val_accuracy: 0.5557 - val_loss: 0.9283 Epoch 45/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5673 - loss: 0.9315 - val_accuracy: 0.5511 - val_loss: 0.9405 Epoch 46/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5799 - loss: 0.9230 - val_accuracy: 0.5458 - val_loss: 0.9278 Epoch 47/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5753 - loss: 0.9284 - val_accuracy: 0.5557 - val_loss: 0.9325 Epoch 48/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5750 - loss: 0.9282 - val_accuracy: 0.5531 - val_loss: 0.9327 Epoch 49/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5833 - loss: 0.9241 - val_accuracy: 0.5419 - val_loss: 0.9591 Epoch 50/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5711 - loss: 0.9239 - val_accuracy: 0.5498 - val_loss: 0.9307 Epoch 51/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5753 - loss: 0.9244 - val_accuracy: 0.5353 - val_loss: 0.9555 Epoch 52/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5800 - loss: 0.9236 - val_accuracy: 0.5583 - val_loss: 0.9346 Epoch 53/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5714 - loss: 0.9215 - val_accuracy: 0.5577 - val_loss: 0.9521 Epoch 54/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5802 - loss: 0.9168 - val_accuracy: 0.5564 - val_loss: 0.9364 Epoch 55/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5874 - loss: 0.9123 - val_accuracy: 0.5478 - val_loss: 0.9511 Epoch 56/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5665 - loss: 0.9248 - val_accuracy: 0.5491 - val_loss: 0.9346 Epoch 57/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5839 - loss: 0.9101 - val_accuracy: 0.5465 - val_loss: 0.9399 Epoch 58/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.5774 - loss: 0.9132 - val_accuracy: 0.5412 - val_loss: 0.9546 Epoch 59/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 3s 15ms/step - accuracy: 0.5631 - loss: 0.9299 - val_accuracy: 0.5531 - val_loss: 0.9284 Epoch 60/60 190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.5844 - loss: 0.9103 - val_accuracy: 0.5485 - val_loss: 0.9310
In [79]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.3f}")
from sklearn.metrics import classification_report
y_pred=model.predict(X_test).argmax(axis=1)
print(classification_report(y_test,y_pred))
60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.5548 - loss: 0.9276 Test Accuracy: 0.563 60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step precision recall f1-score support 0 0.71 0.67 0.69 869 1 0.31 0.33 0.32 467 2 0.58 0.60 0.59 560 accuracy 0.56 1896 macro avg 0.53 0.53 0.53 1896 weighted avg 0.57 0.56 0.57 1896
In [80]:
import numpy as np
classes = ["Home", "Draw", "Away"]
def predict_upcoming_match_nn(df, model, home_team, away_team, features, alpha=0.7):
"""
Predict match outcome by blending neural network predictions with implied odds
Args:
df: DataFrame containing match data
model: Trained neural network model
home_team: Home team name
away_team: Away team name
features: List of feature column names
alpha: Weight for model predictions (1-alpha for betting odds)
Returns:
Tuple of (predicted_class, probability_array)
"""
match = df[(df["HomeTeam"] == home_team) & (df["AwayTeam"] == away_team)]
if match.empty:
print(f"No match found between {home_team} and {away_team}")
return None, None
X_match = match[features].values
# NN predicts probabilities directly
y_pred_proba = model.predict(X_match)[0] # shape (3,)
# Get implied probabilities from betting odds
implied_home = match['h_implied_prob'].values[0]
implied_draw = match['a_implied_prob'].values[0]
implied_away = match['d_implied_prob'].values[0]
# Blend model probs with odds
final_home = alpha * y_pred_proba[0] + (1 - alpha) * implied_home
final_draw = alpha * y_pred_proba[1] + (1 - alpha) * implied_draw
final_away = alpha * y_pred_proba[2] + (1 - alpha) * implied_away
# Normalize to sum = 1
total = final_home + final_draw + final_away
final_probs = np.array([final_home, final_draw, final_away]) / total
# Final prediction (0=Home, 1=Draw, 2=Away)
y_pred = int(np.argmax(final_probs))
return y_pred, final_probs
# List of upcoming fixtures
fixtures = [
("Nott'm Forest", "Chelsea"),
("Brighton", "Newcastle"),
("Burnley", "Leeds"),
("Crystal Palace", "Bournemouth"),
("Man City", "Everton"),
("Sunderland", "Wolves"),
("Fulham", "Arsenal"),
("Tottenham", "Aston Villa"),
("Liverpool", "Man United"),
("West Ham", "Brentford"),
]
# Make predictions for all fixtures
for home_team, away_team in fixtures:
pred_class, pred_probs = predict_upcoming_match_nn(
store_df, model, home_team, away_team, features
)
if pred_class is None:
continue
print(f"{home_team} vs {away_team}")
print(f"→ Predicted: {pred_class}")
print(f" Home: {pred_probs[0]*100:.2f}%")
print(f" Draw: {pred_probs[1]*100:.2f}%")
print(f" Away: {pred_probs[2]*100:.2f}%")
print("-" * 40)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 45ms/step Nott'm Forest vs Chelsea → Predicted: 2 Home: 24.47% Draw: 36.89% Away: 38.64% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 43ms/step Brighton vs Newcastle → Predicted: 2 Home: 23.35% Draw: 37.42% Away: 39.23% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step Burnley vs Leeds → Predicted: 1 Home: 34.09% Draw: 38.04% Away: 27.87% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step Crystal Palace vs Bournemouth → Predicted: 2 Home: 21.89% Draw: 37.25% Away: 40.86% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step Man City vs Everton → Predicted: 0 Home: 54.30% Draw: 28.49% Away: 17.21% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step Sunderland vs Wolves → Predicted: 0 Home: 46.96% Draw: 33.40% Away: 19.64% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step Fulham vs Arsenal → Predicted: 0 Home: 35.79% Draw: 34.12% Away: 30.09% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step Tottenham vs Aston Villa → Predicted: 0 Home: 36.91% Draw: 27.62% Away: 35.47% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 39ms/step Liverpool vs Man United → Predicted: 0 Home: 67.55% Draw: 23.07% Away: 9.38% ---------------------------------------- 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step West Ham vs Brentford → Predicted: 1 Home: 27.66% Draw: 37.67% Away: 34.67% ----------------------------------------
Neural Network accuracy progression and loss minimization over epochs¶
In [81]:
cm=confusion_matrix(y_test,y_pred)
sns.heatmap(cm,cmap='coolwarm',annot=True,fmt="d")
plt.xlabel('actual')
plt.ylabel('predicited')
plt.show()
In [82]:
plt.plot(history.history['accuracy'],label='Train acc')
plt.plot(history.history['val_accuracy'],label='Val acc')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()
In [83]:
plt.plot(history.history['loss'],label='Train loss')
plt.plot(history.history['val_loss'],label='Val loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()
Logistic Regression¶
In [84]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(class_weight='balanced')
lr.fit(X_train,y_train)
Out[84]:
LogisticRegression(class_weight='balanced')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced')
In [85]:
y_pred=lr.predict(X_test)
print(classification_report(y_test,y_pred))
precision recall f1-score support
0 0.71 0.65 0.68 869
1 0.32 0.37 0.34 467
2 0.58 0.59 0.58 560
accuracy 0.56 1896
macro avg 0.54 0.53 0.53 1896
weighted avg 0.58 0.56 0.57 1896
In [86]:
import numpy as np
classes = ["Home", "Draw", "Away"]
def predict_match_lr(df, model, home_team, away_team, features,alpha=0.7):
"""
Parameters:
df (pd.DataFrame): Dataset with match features.
model: Trained RandomForestClassifier.
home_team (str): Name of the home team.
away_team (str): Name of the away team.
features (list): List of feature column names.
draw_threshold (float): Minimum probability to predict Draw.
Returns:
predicted_class (int or None): 0=Home, 1=Draw, 2=Away
predicted_probs (np.array or None): probability array [Home, Draw, Away]
"""
# Select the match row
match_row = df[(df['HomeTeam'] == home_team) & (df['AwayTeam'] == away_team)]
if match_row.empty:
return None, None
X_match = match_row[features]
# Predict probabilities
y_pred_proba = model.predict_proba(X_match)[0]
implied_prob_home=match_row["h_implied_prob"].values[0]
implied_prob_draw=match_row["d_implied_prob"].values[0]
implied_prob_away=match_row["a_implied_prob"].values[0]
final_home_prob=y_pred_proba[0]*alpha+(1-alpha)*implied_prob_home
final_draw_prob=y_pred_proba[1]*alpha+(1-alpha)*implied_prob_draw
final_away_prob=y_pred_proba[2]*alpha+(1-alpha)*implied_prob_away
total=final_home_prob+final_draw_prob+final_away_prob
final_y_proba=np.array([final_home_prob,final_draw_prob,final_away_prob])/total
return final_y_proba
# ==================================
# Example usage: loop through fixtures
# ==================================
fixtures = [
("Nott'm Forest", "Chelsea"),
("Brighton", "Newcastle"),
("Burnley", "Leeds"),
("Crystal Palace", "Bournemouth"),
("Man City", "Everton"),
("Sunderland", "Wolves"),
("Fulham", "Arsenal"),
("Tottenham", "Aston Villa"),
("Liverpool", "Man United"),
("West Ham", "Brentford"),
]
for home_team, away_team in fixtures:
pred_probs = predict_match_lr(store_df, lr, home_team, away_team, features)
print(f"{home_team} vs {away_team}")
print(" Home ", f"{pred_probs[0]*100:.2f}%")
print(" Draw ", f"{pred_probs[1]*100:.2f}%")
print(" Away ", f"{pred_probs[2]*100:.2f}%")
predicted_class=pred_probs.argmax()
print(predicted_class)
print("-" * 40)
Nott'm Forest vs Chelsea Home 26.73% Draw 41.95% Away 31.31% 1 ---------------------------------------- Brighton vs Newcastle Home 29.06% Draw 41.58% Away 29.36% 1 ---------------------------------------- Burnley vs Leeds Home 35.17% Draw 47.79% Away 17.04% 1 ---------------------------------------- Crystal Palace vs Bournemouth Home 23.25% Draw 46.99% Away 29.76% 1 ---------------------------------------- Man City vs Everton Home 47.19% Draw 25.38% Away 27.43% 0 ---------------------------------------- Sunderland vs Wolves Home 51.99% Draw 27.06% Away 20.95% 0 ---------------------------------------- Fulham vs Arsenal Home 47.98% Draw 38.00% Away 14.02% 0 ---------------------------------------- Tottenham vs Aston Villa Home 29.74% Draw 42.26% Away 28.01% 1 ---------------------------------------- Liverpool vs Man United Home 70.44% Draw 20.54% Away 9.02% 0 ---------------------------------------- West Ham vs Brentford Home 24.65% Draw 41.67% Away 33.68% 1 ----------------------------------------
In [87]:
cm=confusion_matrix(y_test,y_pred)
sns.heatmap(cm,cmap='coolwarm',annot=True,fmt='d')
Out[87]:
<Axes: >