Are Upsets More Likely Between Division Rivals in the NFL?

Ben Namovicz

Introduction

In week 14 of the 2018 NFL season, the 6-6 Miami Dolphins beat the 9-3 New England Patriots in miraculous fashion. The Patriots would go on to win the Super Bowl that season, while the Dolphins didn't even make the playoffs. The very next year, the 4-11 Dolphins beat the 12-3 Patriots in another big upset. Taken alone these games are a remarkable story, but they also fit into a common narrative in sports.

It is often claimed that division rivals have an easier time pulling off upsets than out-of-division opponents. The two in-division upsets discussed above would seem to support this claim, but does it hold up to rigorous data analysis? In this tutorial we will use Python's data analysis tools to put this claim to the test.


Background Information

There are 32 teams in the NFL. These teams are divided into 2 conferences of 16 teams each. The conferences are further subdivided into divisions of 4 teams each, leaving the NFL with 8 total divisions. These divisions can be seen below:

AFC:

| East | West | North | South |
|------|------|-------|-------|
| Buffalo Bills | Denver Broncos | Baltimore Ravens | Houston Texans |
| Miami Dolphins | Kansas City Chiefs | Cincinnati Bengals | Indianapolis Colts |
| New England Patriots | Las Vegas Raiders | Cleveland Browns | Jacksonville Jaguars |
| New York Jets | Los Angeles Chargers | Pittsburgh Steelers | Tennessee Titans |

NFC:

| East | West | North | South |
|------|------|-------|-------|
| Dallas Cowboys | Arizona Cardinals | Chicago Bears | Atlanta Falcons |
| New York Giants | Los Angeles Rams | Detroit Lions | Carolina Panthers |
| Philadelphia Eagles | San Francisco 49ers | Green Bay Packers | New Orleans Saints |
| Washington Football Team | Seattle Seahawks | Minnesota Vikings | Tampa Bay Buccaneers |

An NFL team plays 16 games in a season. Six of these are in-division games, so a team plays each of the other three teams in its division twice every year. The other 10 games are played out of division, and those matchups change every year; a team never plays an out-of-division opponent more than once in a season. As a result, a team is much more familiar with the teams in its division, which is where the logic behind the claim we are testing comes from: if you face the same team twice every year, you should know how they operate, and that familiarity should level the playing field between good teams and bad teams.

Data Collection

In order to test our hypothesis we will definitely need records of who wins and loses each game. We will also need some measure of what counts as an 'upset'. The most obvious way to quantify upsets would be to use win-loss records, but this is flawed for two big reasons:

  1. Teams start at 0-0 each season, so early season games will have to be thrown out
  2. Records can be deceiving. A 5-4 record against good teams should be better than a 6-3 record against terrible teams.

Luckily, there is an approach that solves both of these problems. The Elo rating system is a method of ranking teams based on their performance over time. For this project we will use 538's Historical NFL Elo Ratings. The complete dataset, which includes Elo scores and game results, can be found here. An in-depth explanation of how the Elo ratings are calculated can be found here. For our purposes we just need to know that a team's Elo score is a representation of how good they are, with better teams having higher scores. As we will see later, these scores can be used to calculate a predicted probability of winning or a predicted point spread.

Now let's get started by importing some modules. Here are the modules we will be using:
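Something like the following covers the tools used in the rest of the tutorial; the exact modules in the original notebook may differ slightly:

```python
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hypothesis testing and machine learning (used in later sections)
from scipy import stats
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score
```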

Elo Data

Next we read in our 538 NFL Elo data. This dataset records the date, teams, Elo ratings, and results of every NFL game.
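A minimal sketch of loading the data, assuming the CSV has been downloaded as `nfl_elo.csv` from 538's data repository:

```python
# Load 538's historical NFL Elo data into a dataframe.
elo = pd.read_csv("nfl_elo.csv")
elo.head()
```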

Divisions

This dataframe only uses 3-letter team ids to identify teams. To analyze division relationships with these same ids we will have to create our own dataframe. This sounds like a lot of work, but there are only 32 teams to worry about. You can also see the NFL divisions here.
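A sketch of building that dataframe is below. The 3-letter ids shown are assumptions; relocated franchises in particular may appear under older ids (e.g. 'OAK', 'SD', 'STL'), so the mapping should be checked against the ids that actually appear in the Elo data:

```python
# Map each 3-letter team id to its division.
division_map = {
    'BUF': 'AFC East', 'MIA': 'AFC East', 'NE': 'AFC East', 'NYJ': 'AFC East',
    'DEN': 'AFC West', 'KC': 'AFC West', 'LV': 'AFC West', 'LAC': 'AFC West',
    # ...the remaining six divisions follow the tables shown above
}
divisions = pd.DataFrame(list(division_map.items()), columns=['team', 'division'])
```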

Data Processing

Before we can analyze the data, we need to clean it up. We only want to look at regular season games, which are denoted in the data by leaving the playoff column blank. Here we mark those games with 'n/a' and then select only games with 'n/a' in the playoff column.
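Roughly, assuming the column is named `playoff` as in 538's file:

```python
# Regular-season games have a blank playoff column; fill it and filter on it.
elo['playoff'] = elo['playoff'].fillna('n/a')
regular = elo[elo['playoff'] == 'n/a']
```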

Next we remove a number of columns that are not useful for our analysis. An important question here is which Elo values to use. The dataset provides both a standard Elo and a QB-adjusted Elo. Quarterbacks are usually considered the most impactful players on a football team, so 538 creates a quarterback rating system and adjusts each team's Elo score to account for its starting quarterback. This becomes very relevant when a quarterback is injured, as the team's Elo will immediately drop to account for the weaker backup instead of having to wait for the team to lose before adjusting the rating. The QB-adjusted Elo is considered a more accurate assessment of a team's overall strength, so it is what we will use for this analysis.
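A sketch of the column selection, assuming 538's column names (`qbelo1_pre`, `qbelo2_pre`, `qbelo_prob1`, `score1`, `score2`, and so on):

```python
# Keep only the columns we need, using the QB-adjusted Elo ratings.
cols = ['date', 'season', 'team1', 'team2',
        'qbelo1_pre', 'qbelo2_pre', 'qbelo_prob1', 'score1', 'score2']
games = regular[cols].copy()
```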

The last thing we need to do to clean our data is to restrict it to years where the divisions are the same. According to Wikipedia, the divisions of the NFL were realigned in 2002, so any data before that would not match our divisions dataset. We will also exclude the 2020 season as it is still ongoing.
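Using the `games` dataframe from the sketch above:

```python
# Keep only seasons played under the post-2002 division alignment,
# and drop the still-ongoing 2020 season.
games = games[(games['season'] >= 2002) & (games['season'] <= 2019)]
```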

Next we need to identify which games are in division and which are not. To do this we define a helper function to check whether two team ids belong to the same division. We apply the function to our elo dataset.
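A possible implementation, using the `divisions` dataframe sketched earlier:

```python
# Series mapping team id -> division name for fast lookups.
division_lookup = divisions.set_index('team')['division']

def same_division(team1, team2):
    # Teams missing from the mapping are treated as out of division.
    d1 = division_lookup.get(team1)
    d2 = division_lookup.get(team2)
    return d1 is not None and d1 == d2

games['in_division'] = games.apply(
    lambda row: same_division(row['team1'], row['team2']), axis=1)
```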

Finally, we would like to have a predicted probability and point spread for future analysis. Luckily our dataset already comes with a probability, but we will have to calculate the predicted score margin ourselves. 538 gives a formula for predicted margin based on Elo scores.

$\hat{margin} = \frac{elo_1 - elo_2}{25}$

We also calculate the actual margin by simply subtracting the final scores, and we record the result of the game as 1, 0, or 0.5: a result of 1 means team 1 won, 0 means team 2 won, and 0.5 means the teams tied.
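In code, this might look like the following (column names as assumed above):

```python
# Predicted margin from 538's formula: (elo1 - elo2) / 25, positive favoring team 1.
games['pred_margin'] = (games['qbelo1_pre'] - games['qbelo2_pre']) / 25

# Actual margin and result from team 1's perspective (1 = win, 0 = loss, 0.5 = tie).
games['margin'] = games['score1'] - games['score2']
games['result'] = np.where(games['margin'] > 0, 1.0,
                           np.where(games['margin'] < 0, 0.0, 0.5))
```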

An important fact about our predictions is that they are blind to division. That means the prediction is the same for any two games with the same Elo difference, regardless of whether they are in division. This is what lets us test our question: if upsets are more common for in-division games, we would expect to see less accurate predictions in division, because for an upset to occur, the prediction has to be wrong.

Data Analysis

Now that we have clean data we can perform a variety of visualizations and hypothesis tests to better understand it.

Data Visualization: Win Margins

The first visualization will test how good our predicted margin is. We can plot a simple scatterplot of our prediction against the actual result. It is important to verify that our predictor is effective because we are using it to define 'underdog' and 'upset'. The validity of our results depends on knowing that our evaluations of team strength are meaningful.
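A minimal version of that scatterplot, using the columns defined in the processing sketches above:

```python
# Predicted margin vs. actual margin for every game.
plt.figure(figsize=(8, 6))
plt.scatter(games['pred_margin'], games['margin'], s=5, alpha=0.3)
plt.xlabel('Predicted margin (team 1 - team 2)')
plt.ylabel('Actual margin (team 1 - team 2)')
plt.title('Predicted vs. actual win margin')
plt.show()
```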

We can see that teams tend to do better with higher expected margins, but there is a lot of variation. This should be expected, as football game scores are pretty hard to predict. Now we will make the same plot, but color-coded based on division. This is achieved by plotting two subplots on top of each other. This plot only shows two seasons, as including more makes it crowded and hard to read.
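One way to layer the two groups, restricting to the last two seasons in the cleaned data:

```python
# Same scatterplot, color-coded by whether the game was in division.
recent = games[games['season'] >= 2018]
out_div = recent[~recent['in_division']]
in_div = recent[recent['in_division']]

plt.figure(figsize=(8, 6))
plt.scatter(out_div['pred_margin'], out_div['margin'], s=15, label='Out of division')
plt.scatter(in_div['pred_margin'], in_div['margin'], s=15, label='In division')
plt.xlabel('Predicted margin')
plt.ylabel('Actual margin')
plt.legend()
plt.show()
```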

There isn't any obvious difference based on division in this plot. If in-division games were more likely to end in upsets, we would expect to see a weaker correlation between predicted margin and margin for in division games, but that does not appear to be the case.

Data Visualization: Win percentage

Win margins are a great way to test prediction accuracy, but at the end of the day all that matters in football is who wins the game. A good prediction should accurately describe a team's chances of winning: ideally, teams given a 60% chance of winning should win 60% of their games. We can test this directly by putting the data into buckets and calculating the win percentage for each bucket. For example, what percent of teams given a 35%-40% chance to win end up winning? Hopefully about 35 to 40 percent. Below we plot this using 20 equally sized intervals of 5%. To assign a bucket to each data point we use pandas.cut, and then use pandas.groupby to calculate the win percentage of each bucket. We will plot these win percentages in a bar chart.
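A sketch of the bucketing, assuming the dataset's win probability column is `qbelo_prob1`:

```python
# Bucket games into twenty 5% intervals of predicted win probability,
# then compute the observed win rate within each bucket.
bins = np.linspace(0, 1, 21)
games['prob_bucket'] = pd.cut(games['qbelo_prob1'], bins=bins)
win_pct = games.groupby('prob_bucket')['result'].mean()

win_pct.plot(kind='bar', figsize=(10, 5))
plt.xlabel('Predicted win probability for team 1')
plt.ylabel('Actual win rate')
plt.show()
```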

This plot looks pretty good for our predictions: teams win about as often as they are expected to. Now we again check for a difference between in-division and out-of-division games. This time we get both datasets on one plot by separately calculating the win percentages for in-division and out-of-division data, then joining the two datasets and plotting them in one chart.
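One way to build that combined chart from the bucketed data:

```python
# Win rate per bucket, split by in-division vs. out-of-division games.
by_div = (games.groupby(['prob_bucket', 'in_division'])['result']
               .mean()
               .unstack('in_division')
               .rename(columns={True: 'In division', False: 'Out of division'}))

by_div.plot(kind='bar', figsize=(10, 5))
plt.xlabel('Predicted win probability for team 1')
plt.ylabel('Actual win rate')
plt.show()
```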

Once again there don't seem to be any obvious differences between our datasets. The data looks a bit noisy, but there are no clear trends. If teams in the same division are more likely to pull off upsets we would expect to see in division teams with low probabilities win more than out of division teams with low probabilities, and vice versa with high win probabilities. Once again we don't really see this in the data.

One piece of information our plots leave out is how many games are included in each bucket. As we will see below, our predictor is relatively cautious and doesn't give many games extreme probabilities like >90% or <10%. To show the distribution of predictions we will make a plot where the height of bars represents how many games are in each bucket. The bars are colored based on how many of those games are wins or losses. To create this plot we will plot the number of wins in each bucket on top of the number of games.
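A rough version of that plot (note that ties contribute 0.5 to the win count in this sketch):

```python
# Total games per bucket, with the number of wins drawn on top of the total
# so the remaining height of each bar represents losses.
counts = games.groupby('prob_bucket')['result'].agg(games='count', wins='sum')

plt.figure(figsize=(10, 5))
plt.bar(range(len(counts)), counts['games'], label='Losses')
plt.bar(range(len(counts)), counts['wins'], label='Wins')
plt.xticks(range(len(counts)), [str(i) for i in counts.index], rotation=90)
plt.xlabel('Predicted win probability for team 1')
plt.ylabel('Number of games')
plt.legend()
plt.show()
```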

Prediction accuracy

A good way to evaluate our predictions is to test accuracy. This is a very simple measurement that asks whether the team our prediction gave the higher chance of winning ended up winning. It is useful because it has the clearest definition of an upset: the team that was expected to lose won. If we could only do one test to answer our original question, this would be it.
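A sketch of the accuracy calculation; ties are dropped here, and how to treat games predicted at exactly 50% is a judgment call:

```python
# An upset is a game where the team given the lower win probability wins.
decided = games[games['result'] != 0.5]
correct = (decided['qbelo_prob1'] > 0.5) == (decided['result'] == 1.0)

print('Favorite win rate, in division:    ', correct[decided['in_division']].mean())
print('Favorite win rate, out of division:', correct[~decided['in_division']].mean())
```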

It turns out in-division games are slightly more likely to end with the underdog losing than out-of-division games. The difference is very small, so it might be noise (we will test this more rigorously below). Whether it is noise or a meaningful relationship, it seems pretty safe to say that upsets are not more common for in-division games: the favored team wins 63% of the time in division and 62% of the time out of division.

Hypothesis Testing

Our results above left an important question unanswered: is the difference we noticed statistically significant? Put another way, could the difference in underdogs winning be plausibly explained by random chance or should we conclude that there is a meaningful relationship between division and upsets? In order to test this it will be helpful to have a different definition of accuracy.

Below we define two functions to quantify error, where accuracy is understood as low error. Logistic error is used to quantify the accuracy of binary predictions; in this case we are checking how accurate we are in predicting which team wins. Square error is used to quantify the accuracy of continuous predictions; in this case we will be checking how accurate our point margin predictions are.
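One common way to define these two error measures (the original notebook's exact definitions may differ):

```python
# Logistic (cross-entropy) error of a predicted win probability vs. the result.
def logistic_error(prob, result):
    return -(result * np.log(prob) + (1 - result) * np.log(1 - prob))

# Squared error of a predicted point margin vs. the actual margin.
def square_error(pred_margin, margin):
    return (pred_margin - margin) ** 2

games['log_error'] = logistic_error(games['qbelo_prob1'], games['result'])
games['sq_error'] = square_error(games['pred_margin'], games['margin'])
```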

Now it is time to conduct hypothesis testing. The two questions we will be testing are:

  1. Can differences in accuracy between in division and out of division game predictions be explained by random chance?
  2. Can differences in accuracy between in division and out of division point spreads be explained by random chance?

These questions might be a bit confusing, so let's step back and see what precisely we will be doing:

The T-Test

When we calculated accuracy we saw that in division predictions were slightly more accurate than out of division predictions. We want to see whether that difference is statistically significant. If a difference is statistically significant, that means it would be very unlikely to see the difference we see without an underlying cause. If the difference is not statistically significant, that means the difference could easily be explained by random noise in the data.

To test this we are using a t-test. The t-test calculates the p-value, which represents the probability of seeing a difference at least as big as the one we observe given that there is no actual relationship. The possibility that there is no relationship between division and accuracy is called the null hypothesis, and the possibility that there is a relationship is called the alternative hypothesis. If we find that there is a high chance of seeing data similar to ours given no relationship, we say that we have failed to reject the null hypothesis and cannot conclude that there is any relationship between accuracy and division at all. If there is a very small chance of seeing data like ours given no relationship, we say that we have rejected the null hypothesis and conclude that there is a relationship. The most commonly used threshold for statistical significance is 5%, so if p > 5% we fail to reject the null hypothesis, but if p < 5% we reject the null hypothesis.

We will be conducting two t-tests: one for predicted game results and one for predicted margins. In both cases we will be comparing the errors of in-division games to the errors of out-of-division games. Higher errors mean predictions are less accurate, which means upsets are more common.
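Using scipy's two-sample t-test on the error columns defined above:

```python
# Compare prediction errors for in-division vs. out-of-division games.
in_div = games[games['in_division']]
out_div = games[~games['in_division']]

t_win, p_win = stats.ttest_ind(in_div['log_error'], out_div['log_error'])
t_margin, p_margin = stats.ttest_ind(in_div['sq_error'], out_div['sq_error'])

print('Win prediction errors:    p =', p_win)
print('Margin prediction errors: p =', p_margin)
```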

In both cases, our p-value is above 5%, so we fail to reject the null hypothesis. That means we cannot conclude that there is any relationship between division and accuracy of predictions.

Prediction

The last thing to do with our data is to use it for prediction. We will use logistic regression and linear regression on Elo scores, first ignoring division and then including it. To evaluate our regressions we will use ten-fold cross-validation. This randomly divides our dataset into 10 parts. The regression model is trained on 9 of the parts and then tested on the 10th. This is repeated 10 times so that the model is tested on all 10 parts of the data, and we average the test scores across all 10 parts.

So what is the score? For logistic regression we use accuracy as the score. This is exactly the same accuracy calculation we did earlier: how often did the favored team win? For linear regression we use R-squared as the score. R-squared measures how much of the variance in y is explained by X. For example, if R-squared = 0.40 then we can explain 40% of the variance in the outcome from the teams' Elo scores.

Before we start regression, we have to prepare the data we will be regressing on:
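A sketch of that preparation, reusing the columns defined earlier; ties are dropped so the win/loss target is binary:

```python
# Features: the two teams' Elo ratings, with and without the in-division flag.
decided = games[games['result'] != 0.5]
X_win = decided[['qbelo1_pre', 'qbelo2_pre']]
X_win_div = decided[['qbelo1_pre', 'qbelo2_pre', 'in_division']].astype(float)
y_win = decided['result']

# For margin prediction (linear regression) every game can be kept.
X_margin = games[['qbelo1_pre', 'qbelo2_pre']]
X_margin_div = games[['qbelo1_pre', 'qbelo2_pre', 'in_division']].astype(float)
y_margin = games['margin']
```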

Now we can run our first regression. Here we run logistic regression using only the two teams' Elo ratings.
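Roughly:

```python
# Ten-fold cross-validated accuracy using only the two Elo ratings.
scores = cross_val_score(LogisticRegression(max_iter=1000), X_win, y_win,
                         cv=10, scoring='accuracy')
print('Accuracy (Elo only):', scores.mean())
```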

Next we run the same regression, but in addition to Elo ratings we also train on whether the teams are in the same division.
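The same call, now with the in-division flag included as a feature:

```python
# Ten-fold cross-validated accuracy using Elo ratings plus the division flag.
scores_div = cross_val_score(LogisticRegression(max_iter=1000), X_win_div, y_win,
                             cv=10, scoring='accuracy')
print('Accuracy (Elo + division):', scores_div.mean())
```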

The accuracy of our predictor is almost exactly the same with or without knowledge of division. This means that knowing whether two teams are in the same division is completely useless when predicting who will win with our logistic regressor.

Something else to note is that our accuracy of 65% is only slightly better than the 63% accuracy we got from just using the built in Elo formula. This might suggest that the built in formula is as good as you can do with Elo ratings. An interesting follow-up project would be to check whether the difference in accuracy between the built-in Elo prediction formula and linear regression is statistically significant.

Now we move on to linear regression on margin of victory. We are using the same features to predict the margin, but because margin is a continuous value we use linear regression instead of logistic regression.
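For example:

```python
# Ten-fold cross-validated R-squared for predicting point margin from Elo ratings.
lin_scores = cross_val_score(LinearRegression(), X_margin, y_margin,
                             cv=10, scoring='r2')
print('R-squared (Elo only):', lin_scores.mean())
```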

Once again we will repeat the linear regression, this time including division information.
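As before, the only change is the extra feature column:

```python
# Ten-fold cross-validated R-squared with the in-division flag included.
lin_scores_div = cross_val_score(LinearRegression(), X_margin_div, y_margin,
                                 cv=10, scoring='r2')
print('R-squared (Elo + division):', lin_scores_div.mean())
```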

Just like with logistic regression, there is basically no difference between our regressor with or without division information. It is interesting that we were able to achieve 65% accuracy in predicting game results, but we can only explain 14% of variation in score margin. This suggests that score margin is highly varied and hard to predict, something we saw earlier in the scatter plot of predicted margin vs margin. We never calculated R-squared for Elo's built-in margin predictor, but it would be an interesting follow up project to do that and test whether our linear regression is significantly better than that.

Conclusion

The question we set out to answer at the beginning was whether teams are more likely to pull off upsets against in-division opponents than out-of-division opponents. We used data visualizations, accuracy calculations, hypothesis testing, and machine learning to find an answer. In every case we got the same result: there is essentially no relationship between whether a game is played inside a division and whether the underdog wins. This may seem like a disappointing result, but it is a useful one. Sometimes our intuition and common knowledge are wrong, and when conventional wisdom is wrong about something like this we need a way to rigorously determine the truth. This is what data science is for!

During our exploration of this question we found some other questions that could be expanded upon in future analysis. Our Elo ratings come with a built-in method for calculating win probability and point margin. Using machine learning we got what appeared to be slightly better results. Are these results actually better, or is this just random noise? We would need to do hypothesis testing to find out. What about other methods of machine learning? Scikit-learn has lots of options to choose from.

There are a lot of other, more general questions we can also ask: if division doesn't influence the results of games, is there anything else that does? Is it possible to explain more than 14% of the variance in score margin? Can we beat 538's predictions over the long run?

Hopefully the knowledge you gained from this tutorial will be useful should you try to answer these questions, or any other questions you need data science to answer.