from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

import os
data_path="/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files"
season_csv_list=os.listdir(data_path)



#file path for each csv
season_files=[os.path.join(data_path,s) for s in season_csv_list]
print(season_files)

['/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2024.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2023.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2022.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2021.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2020.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2019.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2018.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2017.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2016.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2015.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2014.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2013.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2012.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2011.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2010.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2009.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2008.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2007.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2006.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2005.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2004.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2003.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2002.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2001.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2000.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2025.csv']

def prepare_league_table_stats(df:pd.DataFrame)->pd.DataFrame:
  table:Dict[str,Dict[str,int]]=defaultdict(lambda:{
    "team":0,
    "points":0,
    "goals_for":0,
    "goals_against":0,
    "wins":0,
    "losses":0,
    "draws":0
  })
  for _,row in df.iterrows():
    home,away=row["HomeTeam"],row["AwayTeam"]
    hg,ag=row["FTHG"],row["FTAG"]

    table[home]["team"]=home
    table[away]["team"]=away
    table[home]["goals_for"]+=hg
    table[home]["goals_against"]+=ag
    table[away]["goals_for"]+=ag
    table[away]["goals_against"]+=hg
    if hg>ag:
       table[home]["points"]+=3
       table[home]["wins"]+=1
       table[away]["losses"]+=1
    elif ag>hg:
       table[away]["points"]+=3
       table[away]["wins"]+=1
       table[home]["losses"]+=1
    else:
      table[home]["points"]+=1
      table[away]["points"]+=1
      table[home]["draws"]+=1
      table[away]["draws"]+=1
  table_df=pd.DataFrame(table).T
  table_df["goal_diff"]=table_df["goals_for"]-table_df["goals_against"]
  table_df.sort_values(["points","wins","goals_for"],ascending=[False,False,False],inplace=True)
  table_df=table_df.reset_index(drop=True)
  table_df["position"]=table_df.index+1
  return table_df

def add_league_stats_to_matches(df:pd.DataFrame)->pd.DataFrame:
  """
  Parameter:Season matches stats
  Function:Matches data frame will be used to compute league table stats for each teams.The stats will be merged to corresponding team matches in h2h df
  Return:Matches stats with teams league positions,points,goals difference for both home and away team
  """
  #preparing league positions stats
  table_df=prepare_league_table_stats(df)
  #merging league stats on h2h df
  df = df.drop(columns=[col for col in df.columns if col.startswith(('h_', 'a_'))], errors='ignore')
  merged_df = df.merge(
      table_df.add_prefix("h_"),
      how='left',
      left_on='HomeTeam',
      right_on='h_team'
  ).merge(
      table_df.add_prefix("a_"),
      how='left',
      left_on='AwayTeam',
      right_on='a_team'
  )
  return merged_df

def parse_date_df(df: pd.DataFrame) -> pd.DataFrame:

    from pandas.api.types import is_datetime64_any_dtype

    if not is_datetime64_any_dtype(df['Date']):
        try:
            df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
        except Exception:
            # Fallback format
            df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%y", errors='coerce')

    # Optional: drop rows with invalid dates
    df = df.dropna(subset=['Date'])

    return df

def get_season(df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign Premier League season based on date.
    Season starts in August and ends next year.
    """
    years = df['Date'].dt.year
    df['season'] = (years - (df['Date'].dt.month < 8)).astype(str) + '-' + (years - (df['Date'].dt.month < 8) + 1).astype(str)
    return df

def get_last5_stats(df:pd.DataFrame,team:str):
   team_matches5=df[(df["HomeTeam"]==team)|(df["AwayTeam"]==team)].sort_values(['Date'],ascending=False).head()
   avg_goals_for_last5=(  (team_matches5['HomeTeam']==team) * team_matches5['FTHG']   +
               (team_matches5['AwayTeam']==team) * team_matches5['FTAG'] ).mean()

   avg_goals_against_last5=((team_matches5['HomeTeam']==team) * team_matches5['FTAG']+
                   (team_matches5['AwayTeam']==team) * team_matches5['FTHG']).mean()

   avg_corners_last5=((team_matches5['HomeTeam']==team)* team_matches5['HC']+
                (team_matches5['AwayTeam']==team)* team_matches5['AC']).mean()
   avg_redcards_last5=((team_matches5['HomeTeam']==team) * team_matches5['HR']+
                 (team_matches5['AwayTeam']==team) * team_matches5['AR']).mean()
   # win rate in last 5 matches
   wins=((team_matches5['HomeTeam']==team) & (team_matches5['FTR']=='H') |
         (team_matches5['AwayTeam']==team)& (team_matches5['FTR']=='A')).sum()
   win_rate=wins/len(team_matches5)

   draws=((team_matches5["HomeTeam"]==team) & (team_matches5['FTR']=='D') |
          (team_matches5["AwayTeam"]==team) & (team_matches5['FTR']=='D')).sum()
   draw_rate=draws/len(team_matches5)

   return{
       "avg_goals_for_last5":avg_goals_for_last5,
       "avg_goals_against_last5":avg_goals_against_last5,
       "avg_corners_last5":avg_corners_last5,
       "avg_redcards_last5":avg_redcards_last5,
       "win_rate_last5":win_rate,
       "draw_rate_last5":draw_rate
   }

def add_last5_features(fixtures:pd.DataFrame)->pd.DataFrame:
    features=[]
    for _,row in fixtures.iterrows():
       home_stats_last5=get_last5_stats(fixtures,row['HomeTeam'])
       away_stats_last5=get_last5_stats(fixtures,row['AwayTeam'])

       features.append({
           **row,
           **{f"h_{k}": v for k,v in home_stats_last5.items()},
           **{f"a_{k}":v for k, v in away_stats_last5.items()},

       })
    return pd.DataFrame(features)

def calculate_h2h_win_rates(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate head-to-head win rates for each match."""

    h2h: Dict = {}
    home_rates, away_rates,draw_rates = [], [], []

    for _, row in df.iterrows():
        home, away = row["HomeTeam"], row["AwayTeam"]
        key = tuple(sorted([home, away]))


        stats = h2h.get(key, {"team1": 0, "team2": 0,"draws":0, "matches": 0})
        total = stats["matches"]

        # Compute current H2H rates
        if total == 0:
            home_rate = away_rate = 0.5
            draw_rate=0.33
        else:
            if home == key[0]:
                home_rate = stats["team1"] / total
                away_rate = stats["team2"] / total
            else:
                home_rate = stats["team2"] / total
                away_rate = stats["team1"] / total
            draw_rate=stats["draws"]/total


        home_rates.append(home_rate)
        away_rates.append(away_rate)
        draw_rates.append(draw_rate)

        # Update stats with current match result
        stats["matches"]+=1
        if row["FTR"]=='H':
           stats["team1" if home==key[0] else "team2"]+=1
        elif row["FTR"]=='A':
           stats["team2" if home==key[0] else "team1"]+=1
        else:
          stats["draws"]+=1
        h2h[key] = stats

    # Assign to dataframe
    df["home_vs_away_winrate"] = home_rates
    df["away_vs_home_winrate"] = away_rates
    df["h2h_draw_rate"]=draw_rates
    return df

#preparing dataset
season_fixtures:Dict[str,pd.DataFrame]={}
for file_path in season_files:
    raw = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
    # parsing date and sorting them in desending order
    raw=parse_date_df(raw)
    raw=raw.sort_values(["Date"],ascending=False).reset_index(drop=True)
    raw=get_season(raw)

    for season_name, group in raw.groupby('season'):
        season_fixtures[season_name] = group
        group=add_league_stats_to_matches(group)
        season_fixtures[season_name] = add_last5_features(group)

season_df = pd.concat(season_fixtures.values(), ignore_index=True)

# Shot efficiency
h_shot_eff = season_df["HST"].div(season_df["HS"]).fillna(0)
a_shot_eff = season_df["AST"].div(season_df["AS"]).fillna(0)
season_df["home_shot_efficiency"] = h_shot_eff
season_df["away_shot_efficiency"] = a_shot_eff


# Gap features
season_df["goal_diff_gap"] = season_df["h_goal_diff"] - season_df["a_goal_diff"]
season_df["points_gap"] = season_df["h_points"] - season_df["a_points"]
season_df["form_gap"] = season_df["HomeTeam_Form"] - season_df["AwayTeam_Form"]
season_df["win_rate_gap"] = season_df["h_win_rate_last5"] - season_df["a_win_rate_last5"]
season_df["attack_strength_gap"] = season_df["h_avg_goals_for_last5"] - season_df["a_avg_goals_for_last5"]
season_df["defense_strength_gap"] = season_df["a_avg_goals_against_last5"] - season_df["h_avg_goals_against_last5"]
season_df["league_position_gap"] = season_df["h_position"] - season_df["a_position"]

# Head-to-head win rates
season_df = calculate_h2h_win_rates(season_df)

# =====feature engineering from odd scores of B365 agency====

season_df["home_away_win_ratio"]=season_df["B365H"]/season_df["B365A"]
season_df["draw_odd_ratio"]=season_df["B365D"]/season_df[["B365H","B365A"]].min(axis=1)

# calculating implied probablity of bet odd
season_df["h_implied_prob"]=1/season_df["B365H"]
season_df["a_implied_prob"]=1/season_df["B365D"]
season_df["d_implied_prob"]=1/season_df["B365A"]

season_df["betting_confidence"]=season_df["h_implied_prob"]-season_df["a_implied_prob"]
"""
favorite strength-> Strong team probabilty to win
underdog strength-> Weak team probabilty to win
"""
season_df["favorite_strength"]=season_df[["h_implied_prob","a_implied_prob"]].min(axis=1)
season_df["underdog_strength"]=season_df[["h_implied_prob","a_implied_prob"]].max(axis=1)

#calculating odd variance from different book markers
home_odds_cols = [
    "B365H", "BWH", "BFH", "PSH", "WHH", "1XBH", "MaxH", "AvgH", "BFEH"
]

away_odds_cols = [
    "B365A", "BWA", "BFA", "PSA", "WHA", "1XBA", "MaxA", "AvgA", "BFEA"
]

#==========last 5 matches ===================
season_df['home_draw_rate_last5'] = season_df.groupby('HomeTeam')['FTR'].apply(
    lambda x: (x.shift()=='D').rolling(5, min_periods=1).mean()
).reset_index(level=0, drop=True)

season_df['away_draw_rate_last5'] = season_df.groupby('AwayTeam')['FTR'].apply(
    lambda x: (x.shift()=='D').rolling(5, min_periods=1).mean()
).reset_index(level=0, drop=True)

season_df['draw_rate_gap'] = season_df['home_draw_rate_last5'] - season_df['away_draw_rate_last5']


season_df['home_low_score_rate'] = season_df.groupby('HomeTeam')['FTHG'].apply(lambda x: (x.shift()<=1).rolling(5,min_periods=1).mean()).reset_index(level=0, drop=True)
season_df['away_low_score_rate'] = season_df.groupby('AwayTeam')['FTAG'].apply(lambda x: (x.shift()<=1).rolling(5,min_periods=1).mean()).reset_index(level=0, drop=True)
season_df['low_score_gap'] = season_df['home_low_score_rate'] - season_df['away_low_score_rate']

season_df["home_odd_variance"]=season_df[home_odds_cols].var(axis=1)
season_df["away_odd_variance"]=season_df[away_odds_cols].var(axis=1)
#=============================================================
season_df.sort_values(["season", "Date"], ascending=False, inplace=True)

# Updated selected columns
selected_cols = [
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG",

    # Head to head win rates
    "home_vs_away_winrate",
    "away_vs_home_winrate",

    # Gap features
    "goal_diff_gap",
    "points_gap",
    "form_gap",
    "win_rate_gap",
    "attack_strength_gap",
    "defense_strength_gap",
    "league_position_gap",

    # Match performance stats
    "home_shot_efficiency",
    "away_shot_efficiency",

    # Home advantage
    "HomeTeam_Form",
    "AwayTeam_Form",

    "home_away_win_ratio",
    "draw_odd_ratio",
    "h_implied_prob",
    "a_implied_prob",
    "d_implied_prob",
    "home_odd_variance",
    "away_odd_variance",
    "betting_confidence",
    "favorite_strength",
    "underdog_strength",

"home_draw_rate_last5",
    "away_draw_rate_last5",
"draw_rate_gap",
"home_low_score_rate",
    "away_low_score_rate",
  "low_score_gap",

    "FTR"
]

store_df = season_df.copy()
store_df.replace([-np.inf,np.inf],np.nan,inplace=True)
store_df.fillna(0.5,inplace=True)
season_df = season_df[selected_cols]

season_df.reset_index(drop=True, inplace=True)

# Drop rows with missing critical data
season_df.dropna(subset=["HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR"], inplace=True)

season_df.head()

numeric_cols=season_df.select_dtypes(include=["number"])
plt.figure(figsize=(20,11))
sns.heatmap(numeric_cols.corr(),annot=True,cmap='coolwarm')

<Axes: >

sns.countplot(x=season_df['FTR'])

<Axes: xlabel='FTR', ylabel='count'>

# selecting features for Training and Test sets
features = [

    # head to head win rates
    "home_vs_away_winrate",
    "away_vs_home_winrate",

    # Match performance stats
    "home_shot_efficiency",
    "away_shot_efficiency",



    "goal_diff_gap",
    "points_gap",
    "form_gap",
    "win_rate_gap",
    "attack_strength_gap",
    "defense_strength_gap",
    "league_position_gap",


    "home_away_win_ratio",
    "draw_odd_ratio",
    "h_implied_prob",
    "a_implied_prob",
    "d_implied_prob",
    "home_odd_variance",
    "away_odd_variance",
    "betting_confidence",
    "favorite_strength",
    "underdog_strength",

  "home_draw_rate_last5",
  "away_draw_rate_last5",
  "draw_rate_gap",
  "home_low_score_rate",
  "away_low_score_rate",
  "low_score_gap"
]

target='FTR'
X=season_df[features]
X = X[features].fillna(0)

y=season_df[target]
outcome_to_int = {
    "H": 0,
    "D": 1,
    "A": 2
}
y=y.map(outcome_to_int)
scaler=StandardScaler()

X=X.replace([np.inf,-np.inf],np.nan)
X=X.fillna(0.5)

#scaling X
X_scaled=scaler.fit_transform(X)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,
                                                    stratify=y)

rf=RandomForestClassifier(    n_estimators=300,          # more trees
    max_depth=12,              # limit tree depth
    min_samples_split=5,       # prevent splits with too few samples
    min_samples_leaf=2,        # leaf must have at least 2 samples
    max_features='sqrt',       # sqrt(num_features) for split
    class_weight="balanced",   # handle class imbalance
    random_state=42)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5922995780590717
              precision    recall  f1-score   support

           0       0.69      0.74      0.71       869
           1       0.36      0.26      0.30       467
           2       0.58      0.64      0.61       560

    accuracy                           0.59      1896
   macro avg       0.54      0.55      0.54      1896
weighted avg       0.57      0.59      0.58      1896

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)

cm

array([[644, 123, 102],
       [188, 121, 158],
       [107,  95, 358]])

import numpy as np

classes = ["Home", "Draw", "Away"]

def predict_match_rf(df, model, home_team, away_team, features,alpha=0.7):
    """
    Predict the outcome of a single match using a trained Random Forest.

    Parameters:
        df (pd.DataFrame): Dataset with match features.
        model: Trained RandomForestClassifier.
        home_team (str): Name of the home team.
        away_team (str): Name of the away team.
        features (list): List of feature column names.
        draw_threshold (float): Minimum probability to predict Draw.

    Returns:
        predicted_class (int or None): 0=Home, 1=Draw, 2=Away
        predicted_probs (np.array or None): probability array [Home, Draw, Away]
    """
    # Select the match row
    match_row = df[(df['HomeTeam'] == home_team) & (df['AwayTeam'] == away_team)]
    if match_row.empty:
        return None, None

    X_match = match_row[features]

    # Predict probabilities
    y_pred_proba = model.predict_proba(X_match)[0]

    implied_prob_home=match_row["h_implied_prob"].values[0]
    implied_prob_draw=match_row["d_implied_prob"].values[0]
    implied_prob_away=match_row["a_implied_prob"].values[0]

    final_home_prob=y_pred_proba[0]*alpha+(1-alpha)*implied_prob_home
    final_draw_prob=y_pred_proba[1]*alpha+(1-alpha)*implied_prob_draw
    final_away_prob=y_pred_proba[2]*alpha+(1-alpha)*implied_prob_away
    total=final_home_prob+final_draw_prob+final_away_prob
    final_y_proba=np.array([final_home_prob,final_draw_prob,final_away_prob])/total



    return  final_y_proba

# ==================================
# Example usage: loop through fixtures
# ==================================
fixtures = [
    ("Nott'm Forest", "Chelsea"),
    ("Brighton", "Newcastle"),
    ("Burnley", "Leeds"),
    ("Crystal Palace", "Bournemouth"),
    ("Man City", "Everton"),
    ("Sunderland", "Wolves"),
    ("Fulham", "Arsenal"),
    ("Tottenham", "Aston Villa"),
    ("Liverpool", "Man United"),
    ("West Ham", "Brentford"),
]

for home_team, away_team in fixtures:
     pred_probs = predict_match_rf(store_df, rf, home_team, away_team, features)
     pred_class=pred_probs.argmax()

     if pred_class is not None:
        print(f"{home_team} vs {away_team}")

        print("   Home ", f"{pred_probs[0]*100:.2f}%")
        print("   Draw ", f"{pred_probs[1]*100:.2f}%")
        print("   Away ", f"{pred_probs[2]*100:.2f}%")
        print("-" * 40)

Nott'm Forest vs Chelsea
   Home  20.28%
   Draw  34.20%
   Away  45.52%
----------------------------------------
Brighton vs Newcastle
   Home  24.35%
   Draw  53.20%
   Away  22.45%
----------------------------------------
Burnley vs Leeds
   Home  27.63%
   Draw  54.39%
   Away  17.98%
----------------------------------------
Crystal Palace vs Bournemouth
   Home  23.42%
   Draw  53.72%
   Away  22.86%
----------------------------------------
Man City vs Everton
   Home  45.25%
   Draw  40.90%
   Away  13.85%
----------------------------------------
Sunderland vs Wolves
   Home  42.66%
   Draw  42.12%
   Away  15.22%
----------------------------------------
Fulham vs Arsenal
   Home  16.70%
   Draw  62.32%
   Away  20.98%
----------------------------------------
Tottenham vs Aston Villa
   Home  36.39%
   Draw  31.74%
   Away  31.87%
----------------------------------------
Liverpool vs Man United
   Home  55.57%
   Draw  35.36%
   Away  9.07%
----------------------------------------
West Ham vs Brentford
   Home  21.15%
   Draw  30.79%
   Away  48.06%
----------------------------------------

from tensorflow import keras
from tensorflow.keras import layers
model=keras.Sequential([
    layers.Dense(128,activation='relu',input_shape=(X_train.shape[1],)),
    layers.Dropout(0.3),

    layers.Dense(64,activation='relu'),
    layers.Dropout(0.3),

    layers.Dense(32,activation='relu'),
    layers.Dense(3,activation='softmax')
])

model.compile(optimizer='adam',metrics=['accuracy'],loss='sparse_categorical_crossentropy')

from sklearn.utils.class_weight import compute_class_weight
class_weights=compute_class_weight('balanced',classes=np.unique(y_train),y=y_train)
class_weights=dict(enumerate(class_weights))
history=model.fit(X_train,y_train,
                  batch_size=32,epochs=60,
                  validation_split=0.2,class_weight=class_weights)

Epoch 1/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 4ms/step - accuracy: 0.4141 - loss: 1.6111 - val_accuracy: 0.5201 - val_loss: 0.9689
Epoch 2/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4887 - loss: 1.0220 - val_accuracy: 0.5188 - val_loss: 0.9921
Epoch 3/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4925 - loss: 1.0147 - val_accuracy: 0.5122 - val_loss: 1.0036
Epoch 4/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5131 - loss: 1.0021 - val_accuracy: 0.5069 - val_loss: 0.9963
Epoch 5/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5309 - loss: 0.9876 - val_accuracy: 0.4819 - val_loss: 1.0162
Epoch 6/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5128 - loss: 0.9928 - val_accuracy: 0.5049 - val_loss: 0.9804
Epoch 7/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5139 - loss: 0.9967 - val_accuracy: 0.5016 - val_loss: 0.9896
Epoch 8/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5216 - loss: 0.9859 - val_accuracy: 0.4957 - val_loss: 1.0072
Epoch 9/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5194 - loss: 0.9877 - val_accuracy: 0.5129 - val_loss: 0.9728
Epoch 10/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5100 - loss: 0.9995 - val_accuracy: 0.5115 - val_loss: 0.9666
Epoch 11/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5164 - loss: 0.9972 - val_accuracy: 0.5201 - val_loss: 0.9587
Epoch 12/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5232 - loss: 0.9890 - val_accuracy: 0.5082 - val_loss: 0.9907
Epoch 13/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5152 - loss: 0.9866 - val_accuracy: 0.5221 - val_loss: 0.9571
Epoch 14/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5302 - loss: 0.9760 - val_accuracy: 0.5221 - val_loss: 0.9719
Epoch 15/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5322 - loss: 0.9743 - val_accuracy: 0.5115 - val_loss: 0.9797
Epoch 16/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5221 - loss: 0.9716 - val_accuracy: 0.5096 - val_loss: 0.9716
Epoch 17/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5240 - loss: 0.9743 - val_accuracy: 0.5115 - val_loss: 0.9778
Epoch 18/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5289 - loss: 0.9737 - val_accuracy: 0.5142 - val_loss: 0.9725
Epoch 19/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5405 - loss: 0.9687 - val_accuracy: 0.5274 - val_loss: 0.9704
Epoch 20/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5321 - loss: 0.9807 - val_accuracy: 0.5214 - val_loss: 0.9836
Epoch 21/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5345 - loss: 0.9760 - val_accuracy: 0.5194 - val_loss: 0.9728
Epoch 22/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5281 - loss: 0.9780 - val_accuracy: 0.5254 - val_loss: 0.9826
Epoch 23/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5439 - loss: 0.9677 - val_accuracy: 0.5379 - val_loss: 0.9744
Epoch 24/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5383 - loss: 0.9714 - val_accuracy: 0.5320 - val_loss: 0.9686
Epoch 25/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5241 - loss: 0.9718 - val_accuracy: 0.5339 - val_loss: 0.9635
Epoch 26/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5329 - loss: 0.9708 - val_accuracy: 0.5412 - val_loss: 0.9515
Epoch 27/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.5500 - loss: 0.9530 - val_accuracy: 0.5478 - val_loss: 0.9592
Epoch 28/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5733 - loss: 0.9509 - val_accuracy: 0.5419 - val_loss: 0.9571
Epoch 29/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5494 - loss: 0.9562 - val_accuracy: 0.5485 - val_loss: 0.9588
Epoch 30/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5548 - loss: 0.9534 - val_accuracy: 0.5630 - val_loss: 0.9591
Epoch 31/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5587 - loss: 0.9460 - val_accuracy: 0.5623 - val_loss: 0.9357
Epoch 32/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5723 - loss: 0.9398 - val_accuracy: 0.5590 - val_loss: 0.9323
Epoch 33/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5788 - loss: 0.9412 - val_accuracy: 0.5583 - val_loss: 0.9435
Epoch 34/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5719 - loss: 0.9378 - val_accuracy: 0.5577 - val_loss: 0.9499
Epoch 35/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.5663 - loss: 0.9445 - val_accuracy: 0.5498 - val_loss: 0.9454
Epoch 36/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5738 - loss: 0.9407 - val_accuracy: 0.5471 - val_loss: 0.9415
Epoch 37/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5575 - loss: 0.9400 - val_accuracy: 0.5386 - val_loss: 0.9666
Epoch 38/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5685 - loss: 0.9449 - val_accuracy: 0.5557 - val_loss: 0.9381
Epoch 39/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5646 - loss: 0.9546 - val_accuracy: 0.5550 - val_loss: 0.9387
Epoch 40/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5690 - loss: 0.9244 - val_accuracy: 0.5557 - val_loss: 0.9431
Epoch 41/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 5ms/step - accuracy: 0.5730 - loss: 0.9285 - val_accuracy: 0.5610 - val_loss: 0.9316
Epoch 42/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5670 - loss: 0.9315 - val_accuracy: 0.5544 - val_loss: 0.9326
Epoch 43/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.5634 - loss: 0.9394 - val_accuracy: 0.5452 - val_loss: 0.9539
Epoch 44/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.5650 - loss: 0.9390 - val_accuracy: 0.5557 - val_loss: 0.9283
Epoch 45/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5673 - loss: 0.9315 - val_accuracy: 0.5511 - val_loss: 0.9405
Epoch 46/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5799 - loss: 0.9230 - val_accuracy: 0.5458 - val_loss: 0.9278
Epoch 47/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5753 - loss: 0.9284 - val_accuracy: 0.5557 - val_loss: 0.9325
Epoch 48/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5750 - loss: 0.9282 - val_accuracy: 0.5531 - val_loss: 0.9327
Epoch 49/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5833 - loss: 0.9241 - val_accuracy: 0.5419 - val_loss: 0.9591
Epoch 50/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5711 - loss: 0.9239 - val_accuracy: 0.5498 - val_loss: 0.9307
Epoch 51/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5753 - loss: 0.9244 - val_accuracy: 0.5353 - val_loss: 0.9555
Epoch 52/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5800 - loss: 0.9236 - val_accuracy: 0.5583 - val_loss: 0.9346
Epoch 53/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5714 - loss: 0.9215 - val_accuracy: 0.5577 - val_loss: 0.9521
Epoch 54/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5802 - loss: 0.9168 - val_accuracy: 0.5564 - val_loss: 0.9364
Epoch 55/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5874 - loss: 0.9123 - val_accuracy: 0.5478 - val_loss: 0.9511
Epoch 56/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5665 - loss: 0.9248 - val_accuracy: 0.5491 - val_loss: 0.9346
Epoch 57/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5839 - loss: 0.9101 - val_accuracy: 0.5465 - val_loss: 0.9399
Epoch 58/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.5774 - loss: 0.9132 - val_accuracy: 0.5412 - val_loss: 0.9546
Epoch 59/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 3s 15ms/step - accuracy: 0.5631 - loss: 0.9299 - val_accuracy: 0.5531 - val_loss: 0.9284
Epoch 60/60
190/190 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.5844 - loss: 0.9103 - val_accuracy: 0.5485 - val_loss: 0.9310

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.3f}")

from sklearn.metrics import classification_report
y_pred=model.predict(X_test).argmax(axis=1)
print(classification_report(y_test,y_pred))

60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.5548 - loss: 0.9276
Test Accuracy: 0.563
60/60 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       869
           1       0.31      0.33      0.32       467
           2       0.58      0.60      0.59       560

    accuracy                           0.56      1896
   macro avg       0.53      0.53      0.53      1896
weighted avg       0.57      0.56      0.57      1896

import numpy as np

classes = ["Home", "Draw", "Away"]

def predict_upcoming_match_nn(df, model, home_team, away_team, features, alpha=0.7):
    """
    Predict match outcome by blending neural network predictions with implied odds

    Args:
        df: DataFrame containing match data
        model: Trained neural network model
        home_team: Home team name
        away_team: Away team name
        features: List of feature column names
        alpha: Weight for model predictions (1-alpha for betting odds)

    Returns:
        Tuple of (predicted_class, probability_array)
    """
    match = df[(df["HomeTeam"] == home_team) & (df["AwayTeam"] == away_team)]
    if match.empty:
        print(f"No match found between {home_team} and {away_team}")
        return None, None

    X_match = match[features].values

    # NN predicts probabilities directly
    y_pred_proba = model.predict(X_match)[0]  # shape (3,)

    # Get implied probabilities from betting odds
    implied_home = match['h_implied_prob'].values[0]
    implied_draw = match['a_implied_prob'].values[0]
    implied_away = match['d_implied_prob'].values[0]

    # Blend model probs with odds
    final_home = alpha * y_pred_proba[0] + (1 - alpha) * implied_home
    final_draw = alpha * y_pred_proba[1] + (1 - alpha) * implied_draw
    final_away = alpha * y_pred_proba[2] + (1 - alpha) * implied_away

    # Normalize to sum = 1
    total = final_home + final_draw + final_away
    final_probs = np.array([final_home, final_draw, final_away]) / total

    # Final prediction (0=Home, 1=Draw, 2=Away)
    y_pred = int(np.argmax(final_probs))

    return y_pred, final_probs


# List of upcoming fixtures
fixtures = [
    ("Nott'm Forest", "Chelsea"),
    ("Brighton", "Newcastle"),
    ("Burnley", "Leeds"),
    ("Crystal Palace", "Bournemouth"),
    ("Man City", "Everton"),
    ("Sunderland", "Wolves"),
    ("Fulham", "Arsenal"),
    ("Tottenham", "Aston Villa"),
    ("Liverpool", "Man United"),
    ("West Ham", "Brentford"),
]

# Make predictions for all fixtures
for home_team, away_team in fixtures:
    pred_class, pred_probs = predict_upcoming_match_nn(
        store_df, model, home_team, away_team, features
    )

    if pred_class is None:
        continue

    print(f"{home_team} vs {away_team}")
    print(f"→ Predicted: {pred_class}")
    print(f"   Home: {pred_probs[0]*100:.2f}%")
    print(f"   Draw: {pred_probs[1]*100:.2f}%")
    print(f"   Away: {pred_probs[2]*100:.2f}%")
    print("-" * 40)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 45ms/step
Nott'm Forest vs Chelsea
→ Predicted: 2
   Home: 24.47%
   Draw: 36.89%
   Away: 38.64%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 43ms/step
Brighton vs Newcastle
→ Predicted: 2
   Home: 23.35%
   Draw: 37.42%
   Away: 39.23%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step
Burnley vs Leeds
→ Predicted: 1
   Home: 34.09%
   Draw: 38.04%
   Away: 27.87%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 42ms/step
Crystal Palace vs Bournemouth
→ Predicted: 2
   Home: 21.89%
   Draw: 37.25%
   Away: 40.86%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step
Man City vs Everton
→ Predicted: 0
   Home: 54.30%
   Draw: 28.49%
   Away: 17.21%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step
Sunderland vs Wolves
→ Predicted: 0
   Home: 46.96%
   Draw: 33.40%
   Away: 19.64%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step
Fulham vs Arsenal
→ Predicted: 0
   Home: 35.79%
   Draw: 34.12%
   Away: 30.09%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 44ms/step
Tottenham vs Aston Villa
→ Predicted: 0
   Home: 36.91%
   Draw: 27.62%
   Away: 35.47%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 39ms/step
Liverpool vs Man United
→ Predicted: 0
   Home: 67.55%
   Draw: 23.07%
   Away: 9.38%
----------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step
West Ham vs Brentford
→ Predicted: 1
   Home: 27.66%
   Draw: 37.67%
   Away: 34.67%
----------------------------------------

cm=confusion_matrix(y_test,y_pred)
sns.heatmap(cm,cmap='coolwarm',annot=True,fmt="d")
plt.xlabel('actual')
plt.ylabel('predicited')
plt.show()

plt.plot(history.history['accuracy'],label='Train acc')
plt.plot(history.history['val_accuracy'],label='Val acc')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'],label='Train loss')
plt.plot(history.history['val_loss'],label='Val loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(class_weight='balanced')
lr.fit(X_train,y_train)

LogisticRegression(class_weight='balanced')

LogisticRegression(class_weight='balanced')

y_pred=lr.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       869
           1       0.32      0.37      0.34       467
           2       0.58      0.59      0.58       560

    accuracy                           0.56      1896
   macro avg       0.54      0.53      0.53      1896
weighted avg       0.58      0.56      0.57      1896

import numpy as np

classes = ["Home", "Draw", "Away"]

def predict_match_lr(df, model, home_team, away_team, features,alpha=0.7):
    """


    Parameters:
        df (pd.DataFrame): Dataset with match features.
        model: Trained RandomForestClassifier.
        home_team (str): Name of the home team.
        away_team (str): Name of the away team.
        features (list): List of feature column names.
        draw_threshold (float): Minimum probability to predict Draw.

    Returns:
        predicted_class (int or None): 0=Home, 1=Draw, 2=Away
        predicted_probs (np.array or None): probability array [Home, Draw, Away]
    """
    # Select the match row
    match_row = df[(df['HomeTeam'] == home_team) & (df['AwayTeam'] == away_team)]
    if match_row.empty:
        return None, None

    X_match = match_row[features]

    # Predict probabilities
    y_pred_proba = model.predict_proba(X_match)[0]

    implied_prob_home=match_row["h_implied_prob"].values[0]
    implied_prob_draw=match_row["d_implied_prob"].values[0]
    implied_prob_away=match_row["a_implied_prob"].values[0]

    final_home_prob=y_pred_proba[0]*alpha+(1-alpha)*implied_prob_home
    final_draw_prob=y_pred_proba[1]*alpha+(1-alpha)*implied_prob_draw
    final_away_prob=y_pred_proba[2]*alpha+(1-alpha)*implied_prob_away
    total=final_home_prob+final_draw_prob+final_away_prob
    final_y_proba=np.array([final_home_prob,final_draw_prob,final_away_prob])/total



    return  final_y_proba

# ==================================
# Example usage: loop through fixtures
# ==================================
fixtures = [
    ("Nott'm Forest", "Chelsea"),
    ("Brighton", "Newcastle"),
    ("Burnley", "Leeds"),
    ("Crystal Palace", "Bournemouth"),
    ("Man City", "Everton"),
    ("Sunderland", "Wolves"),
    ("Fulham", "Arsenal"),
    ("Tottenham", "Aston Villa"),
    ("Liverpool", "Man United"),
    ("West Ham", "Brentford"),
]

for home_team, away_team in fixtures:
    pred_probs = predict_match_lr(store_df, lr, home_team, away_team, features)
    print(f"{home_team} vs {away_team}")
    print("   Home ", f"{pred_probs[0]*100:.2f}%")
    print("   Draw ", f"{pred_probs[1]*100:.2f}%")
    print("   Away ", f"{pred_probs[2]*100:.2f}%")
    predicted_class=pred_probs.argmax()
    print(predicted_class)
    print("-" * 40)

Nott'm Forest vs Chelsea
   Home  26.73%
   Draw  41.95%
   Away  31.31%
1
----------------------------------------
Brighton vs Newcastle
   Home  29.06%
   Draw  41.58%
   Away  29.36%
1
----------------------------------------
Burnley vs Leeds
   Home  35.17%
   Draw  47.79%
   Away  17.04%
1
----------------------------------------
Crystal Palace vs Bournemouth
   Home  23.25%
   Draw  46.99%
   Away  29.76%
1
----------------------------------------
Man City vs Everton
   Home  47.19%
   Draw  25.38%
   Away  27.43%
0
----------------------------------------
Sunderland vs Wolves
   Home  51.99%
   Draw  27.06%
   Away  20.95%
0
----------------------------------------
Fulham vs Arsenal
   Home  47.98%
   Draw  38.00%
   Away  14.02%
0
----------------------------------------
Tottenham vs Aston Villa
   Home  29.74%
   Draw  42.26%
   Away  28.01%
1
----------------------------------------
Liverpool vs Man United
   Home  70.44%
   Draw  20.54%
   Away  9.02%
0
----------------------------------------
West Ham vs Brentford
   Home  24.65%
   Draw  41.67%
   Away  33.68%
1
----------------------------------------

cm=confusion_matrix(y_test,y_pred)
sns.heatmap(cm,cmap='coolwarm',annot=True,fmt='d')

<Axes: >

Imports¶

Functions calculating Teams league position based on wins and goals scored¶

Utility functions¶

Preparing Training dataset¶

EDA¶

Distribution of match outcome¶

Feature Selection¶

Random Forest Classifier¶

Neural network¶

Neural Network accuracy progression and loss minimization over epochs¶

Logistic Regression¶

	HomeTeam	AwayTeam	FTHG	FTAG	home_vs_away_winrate	away_vs_home_winrate	goal_diff_gap	points_gap	form_gap	win_rate_gap	...	betting_confidence	favorite_strength	underdog_strength	home_draw_rate_last5	away_draw_rate_last5	draw_rate_gap	home_low_score_rate	away_low_score_rate	low_score_gap	FTR
0	Brentford	Man City	0.0	1.0	0.250000	0.625000	-12.0	-6	0.0	-0.4	...	-0.011696	0.210526	0.222222	0.2	0.2	0.0	0.6	0.4	0.2	A
1	Wolves	Brighton	1.0	1.0	0.142857	0.428571	-9.0	-7	0.0	-0.4	...	0.007508	0.270270	0.277778	0.2	0.2	0.0	0.8	0.8	0.0	D
2	Newcastle	Nott'm Forest	2.0	0.0	0.833333	0.166667	8.0	4	0.0	0.4	...	0.398847	0.238095	0.636943	0.2	0.2	0.0	0.6	1.0	-0.4	H
3	Everton	Crystal Palace	2.0	1.0	0.500000	0.115385	-2.0	-1	0.0	-0.2	...	0.100471	0.307692	0.408163	0.4	0.4	0.0	0.6	1.0	-0.4	H
4	Aston Villa	Burnley	2.0	1.0	0.416667	0.166667	7.0	5	-0.4	0.4	...	0.375000	0.250000	0.625000	0.4	0.0	0.4	0.4	0.8	-0.4	H