%matplotlib inline
All of the code until the next header is there to put the data into a more usable form for generating models. The end result is an array of games:
[game, game, game]
where each game is:
{
id : 'you should not bother with this', # This is just an internal detail to group data.
drives : [drive, drive, drive],
scores : { 'TEAM1': 12, 'TEAM2': 19 }, # The final score, calculated by adding up plays in the game.
winner : 'TEAM2', # The winner, either determined by the ScoreDiff in the data or the scores.
}
where each drive is:
{
points : 7, # Could be -8, -7, -2, 0, 2, 3, 7, 8, negative numbers mean the defense scored.
possessing_team: 'TEAM2', # The team which is on offense during this drive
situations : [situation, situation, situation],
}
and then every situation looks like:
{
'down': 1, # One of 1, 2, 3 or 4.
'time' : 1548, # Seconds left in regulation, so it starts at 3600. Overtime has negative values.
'distance_to_goal' : 55, # 1 for a yard to a goal, 99 for being on the other side.
'yards_to_first_down' : 13, # Number of yards for a first down.
'score_differential' : -7, # The lead of the driving team, basically the "ScoreDiff" column.
'scores' : { 'TEAM1': 12, 'TEAM2': 5 }
}
import csv, itertools
from copy import deepcopy
def determine_post_touchdown_play(row):
    """Return the points added by the play that follows a touchdown.

    There isn't always an extra point recorded after a touchdown. Often after
    an interception or a fumble that was run back, things get weird. If the
    row carries no explicit result, assume an extra point kick that was good.

    Returns 1 for a made kick, 2 for a successful two point conversion, 0 for
    a missed kick or failed conversion, -2 for a successful defensive two
    point return, and 1 when none of those columns has a recognised value.
    """
    kick_result = row['ExPointResult']
    if kick_result == 'Made':
        return 1
    if kick_result == 'Missed':
        return 0

    conversion_result = row['TwoPointConv']
    if conversion_result == 'Success':
        return 2
    if conversion_result == 'Failure':
        return 0

    if row['DefTwoPoint'] == 'Success':
        # Basically the Saints vs Broncos game where they blocked an XP at the end of the game.
        return -2

    # Nothing explicit was recorded: fall back to a made extra point.
    return 1
def cleanup_time(time_string):
    """Expand the scientific-notation spellings '1e3', '2e3' and '3e3'.

    Any other value is returned unchanged.
    """
    expanded = {
        '1e3': '1000',
        '2e3': '2000',
        '3e3': '3000',
    }
    return expanded.get(time_string, time_string)
def determine_winning_team(row):
    """Name the team that leads according to the row's ScoreDiff column.

    A positive differential means row['posteam'] leads, a negative one means
    row['DefensiveTeam'] leads, and zero yields the string 'nobody'.
    """
    lead = int(row['ScoreDiff'])
    if lead > 0:
        return row['posteam']
    if lead < 0:
        return row['DefensiveTeam']
    return 'nobody'
def determine_winning_team_from_scores(scores):
    """Name the team with the highest score in ``scores``.

    ``scores`` maps team name to points. The team holding the strictly
    highest total is returned; if the top total is shared (a tie), or if
    ``scores`` is empty, the string 'nobody' is returned.

    The result is derived from ``scores`` alone. The previous body looked the
    two teams up through a ``row`` variable that is not a parameter of this
    function, so it only worked while a matching module-level ``row`` existed.
    """
    if not scores:
        return 'nobody'
    best = max(scores.values())
    leaders = [team for team, team_points in scores.items() if team_points == best]
    if len(leaders) == 1:
        return leaders[0]
    return 'nobody'
# Holy moly this is a mess, but so is the data.
# TODO(andrew): Rework this so that it doesn't have side effects.
def get_points(row, next_row, reader1, reader2):
    """Return ``(points, row, next_row)`` for the play in ``row``.

    If there is a touchdown, peek ahead and see if there is a two point
    conversion attempt or an extra point attempt. Frequently the play after a
    touchdown is missing, so we do our best and assume an extra point was
    kicked. This doesn't work very well in some games, like the Snow Bowl game
    between the Lions and Eagles, where there were two two point conversion
    attempts that failed and don't have an entry. This results in a final
    score of 34-22, which should be 34-20.

    Points returned:
      * 0 when ``row['sp']`` is not '1';
      * 6 plus the post-touchdown result for a touchdown, negated when the
        row being returned has ReturnResult 'Touchdown' and is not a kickoff;
      * -2 for a safety, 3 for a good field goal;
      * 1 for any other scoring play.

    Side effects: on a touchdown that is followed by another row of the same
    game, ``reader1`` and ``reader2`` are both advanced past the
    post-touchdown play, and the returned ``row`` / ``next_row`` are the
    advanced ones.

    Assumes ``reader2`` is exactly one row ahead of ``reader1`` (as the loop
    in this file sets them up). When the peek-ahead reaches the end of the
    data, ``next_row`` is returned as None instead of letting StopIteration
    escape into the caller's loop.
    """
    if row['sp'] != '1':
        return (0, row, next_row)
    if row['Touchdown'] == '1':
        points = 6
        # If a game ends on a touchdown in overtime, then it moves right into the next game
        # and there is no end of game marker or missing extra point play. Sometimes the extra
        # point play is missing and it moves right to the next game.
        if next_row is not None and row['GameID'] == next_row['GameID']:
            # Peek ahead to the next valid play, skipping ones where penalties occurred.
            # next_row is not None here, so reader1 is guaranteed to have this row.
            row = next(reader1)
            # Advance both of them, so they don't get out of sync.
            next_row = next(reader2, None)
            # Stop skipping once nothing follows: reader1 would be exhausted too.
            while next_row is not None and row['PlayType'] in ('No Play', 'Timeout'):
                row = next(reader1)
                next_row = next(reader2, None)
            next_points = determine_post_touchdown_play(row)
            points = points + next_points
        # Let's check and see if this was an interception or a fumble run back for a TD
        # A return result of TouchDown can also be from a punt or kickoff.
        if row['ReturnResult'] == 'Touchdown' and row['PlayType'] != 'Kickoff':
            return (-1 * points, row, next_row)
        return (points, row, next_row)
    elif row['Safety'] == '1':
        return (-2, row, next_row)
    elif row['FieldGoalResult'] == 'Good':
        return (3, row, next_row)
    else:
        # `debug` is the module-level flag set by the parsing script.
        if debug:
            print('unable to determine points for scoring play')
            print(row['desc'])
            print(row['FieldGoalResult'])
        return (1, row, next_row)
def ppr(row):
    """Pretty print row: write a two-line summary of one play to stdout."""
    columns = (
        'GameID', 'Drive', 'down',
        'yrdline100', 'ScoreDiff',
        'posteam', 'DefensiveTeam', 'desc',
    )
    template = ('Game: {}, Drive: {}, Down: {}, '
                'Yards To Goal: {}, ScoreDiff: {}, '
                'PosTeam: {}, DefTeam: {},\nDesc: {}')
    print(template.format(*(row[column] for column in columns)))
## Build the `games` list from the play-by-play CSV.
# NOTE(review): leading indentation is absent from this copy of the cell, so any nesting
# described in the comments below is inferred from the statements -- confirm against the
# original notebook.
## Types of games here:
## Tampa Bay / Minnesota was a game where the *first* play of overtime is a fumble run back for a touchdown
#f = open('picksix.csv', 'r')
f = open('nfl_play_by_play_2009_2016.csv', 'r')
# NOTE(review): `reader` is not referenced again in this cell (the loop uses the tee'd
# readers created below), and `f` is not closed anywhere in this cell.
reader = csv.DictReader(f)
# `debug` is also read by get_points(); `debug_lots` makes the loop call ppr() on each row.
debug = False
debug_lots = False
# When non-empty, rows whose GameID is not in this list are skipped and counted as ignored.
filter_to_games = []
# Finished games, plus the game and drive currently being assembled.
games = []
current_game = None
current_drive = None
# `total` is incremented once per loop iteration; `ignored` counts rows that are skipped.
ignored = 0
total = 0
# NOTE(review): `last_row` is not referenced again in this cell.
last_row = None
# Two iterators over the same parsed rows. reader2 is advanced by one row so that each loop
# iteration pairs a row with the row that follows it; zip_longest fills the final pair's
# next_row with None.
reader1, reader2 = itertools.tee(csv.DictReader(f))
next(reader2)
for row, next_row in itertools.zip_longest(reader1, reader2):
total = total + 1
# So we ignore plays with this content, they seem to be meaningless
if row['desc'] == '*** play under review ***':
ignored = ignored + 1
continue
if len(filter_to_games) > 0 and row['GameID'] not in filter_to_games:
ignored = ignored + 1
continue
if debug_lots:
ppr(row)
# current_game is None on the first processed row, and on the first row after a game
# has been closed out further down, i.e. this is the first row of a new game.
if current_game is None:
if debug:
print('START OF GAME {}, {} @ {}'.format(row['GameID'], row['AwayTeam'],row['HomeTeam']))
current_game = {
'id': row['GameID'],
'drives': [],
'scores': {
row['HomeTeam']: 0,
row['AwayTeam']: 0,
},
}
# Indicates either it's the first row in the file, or it's the first row
# of a new drive.
if current_drive is None:
if debug:
print('--> START OF DRIVE {}'.format(row['Drive']))
current_drive = {
'id': row['Drive'],
'possessing_team': row['posteam'],
'situations': [],
}
# Record the situation at the start of the play; rows whose down is 'NA' are presumably
# meant to contribute no situation (nesting inferred).
if row['down'] != 'NA':
situation_at_start_of_play = {
'down': int(row['down']),
'time': int(cleanup_time(row['TimeSecs'])),
'distance_to_goal': int(row['yrdline100']),
'yards_to_first_down': int(row['ydstogo']),
'score_differential': int(row['ScoreDiff']),
# Snapshot of the running scores before this play's points are added below.
'scores': deepcopy(current_game['scores']),
}
current_drive['situations'].append(situation_at_start_of_play)
# Our attempt at keeping track of the points, since the ScoreDiff isn't always right.
# get_points() may advance reader1/reader2 and hand back a later row and next_row.
points, row, next_row = get_points(row, next_row, reader1, reader2)
# Positive points are credited to the returned row's posteam, negative points (sign
# flipped) to its DefensiveTeam.
if points > 0:
current_game['scores'][row['posteam']] = current_game['scores'][row['posteam']] + points
elif points < 0:
current_game['scores'][row['DefensiveTeam']] = current_game['scores'][row['DefensiveTeam']] - points
# Constantly keep track of what the current winner is, because sometimes the final row for a game has the
# score differential, sometimes it doesn't. ¯\_(ツ)_/¯
if row['ScoreDiff'] != 'NA':
current_game['winner'] = determine_winning_team(row)
# 'winner_scores' is the winner according to our own running tally.
current_game['winner_scores'] = determine_winning_team_from_scores(current_game['scores'])
# If it's the end of the game, reset the game and drive.
# Sometimes we have an "End of Game" play, sometimes the game just ends,
# and there is always the last game in the file. Sometimes there are two
# end of game plays. ¯\_(ツ)_/¯
if row['PlayType'] == 'End of Game' \
or next_row is None \
or next_row['GameID'] != current_game['id']:
# Sometimes we get two 'end of game' plays. If this is the case, just advance both readers
if next_row is not None and next_row['PlayType'] == 'End of Game':
ignored = ignored + 1
next(reader1), next(reader2)
# Close out the drive in progress with the points from this play, then the game.
current_drive['points'] = points
if debug:
print('--> END OF DRIVE {}, resulting in {} points'.format(current_drive['id'], current_drive['points']))
current_game['drives'].append(current_drive)
current_drive = None
games.append(current_game)
if debug:
print('END OF GAME {}, winner is {} scores are {}'.format(
current_game['id'], current_game['winner'], current_game['scores']))
current_game = None
continue
# This was the last play in the drive, either scoring or not
if current_drive['id'] != next_row['Drive']:
current_drive['points'] = points
current_game['drives'].append(current_drive)
if debug:
print('--> END OF DRIVE {}, resulting in {} points'.format(current_drive['id'], current_drive['points']))
current_drive = None
# Record the last drive for the last game
# NOTE(review): this presumably runs after the loop, only when a drive is still open.
if current_drive is not None and len(current_drive['situations']) > 0:
current_drive['points'] = 0
current_game['drives'].append(current_drive)
games.append(current_game)
#print('end of game {}, winner is {} scores are {}'.format(current_game['id'], current_game['winner'], current_game['scores']))
# Report a disagreement between the ScoreDiff-based winner and the tally-based winner.
# NOTE(review): if this check sits outside the block above, current_game may be None here -- confirm.
if current_game['winner'] != current_game['winner_scores']:
print('I thought {} won the game, but scorediff thinks that {} won the game'.format(current_game['winner_scores'], current_game['winner']))
print('ignored ' + str(ignored) + ' rows of ' + str(total) + ' total ' + str(total - ignored) + ' observations')
# Flatten games -> drives -> situations into one list of situation dicts,
# printing the number of games first and the number of situations last.
rows = []
print(len(games))
for game in games:
    for drive in game['drives']:
        rows.extend(drive['situations'])
print(len(rows))
import matplotlib.pyplot as plt
import numpy as np
# Histograms of down, yards to first down and yards to goal, side by side.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

downs = [row['down'] for row in rows]
downs_hist = axes[0].hist(downs, 4)
axes[0].set_title('Down')

# Bin edges are spread evenly from the smallest to the largest observed value.
yardstogo = [row['yards_to_first_down'] for row in rows]
ytg_hist = axes[1].hist(yardstogo, bins=np.linspace(min(yardstogo), max(yardstogo), max(yardstogo)), log=True)
axes[1].set_title('Yards to 1st Down')

yardstogoal = [row['distance_to_goal'] for row in rows]
ytg_hist = axes[2].hist(yardstogoal, bins=np.linspace(min(yardstogoal), max(yardstogoal), max(yardstogoal)), log=False)
axes[2].set_title('Yards To Goal')

# Separate, wider figure for the time remaining; the x axis is inverted so
# that it reads from the most time remaining down to the least.
time_remaining = [row['time'] for row in rows]
plt.figure(figsize=(30, 5))
plt.title('Seconds Left In Regulation')
plt.gca().invert_xaxis()
time_remaining_hist = plt.hist(time_remaining,
                               bins=np.linspace(min(time_remaining), max(time_remaining), max(time_remaining)),
                               log=True)
# time_remaining_hist.set_title('Time Left')

# Drop the references to the large intermediates. `plt` is deliberately left
# alone: rebinding it to None would break any later use of the pyplot alias.
time_remaining = time_remaining_hist = None
You can see that at the end of the half and at the end of the game the rate of plays picks up. The beginning of each quarter sees some very slow plays.
How should we create GLM inputs for Expected Points? Should we bucket them?
Time should probably be the amount of time left in regulation. Overtime is nice and all but I'm happy to not model it.
Down should be crossed with yards to 1st down, probably. The yardage becomes more and more important as you approach fourth down.
import pandas as pd
import statsmodels.api as sm
# Turn every situation into one regression row: the situation's own fields
# (minus down, scores and score_differential), the points its drive ended
# with, indicator columns, and a running count of first-down plays seen so
# far in the drive.
glm_rows = []
for game in games:
    for drive in game['drives']:
        first_downs = 0
        for situation in drive['situations']:
            glm_row = {
                key: value
                for key, value in situation.items()
                if key not in ('down', 'scores', 'score_differential')
            }
            glm_row['points'] = drive['points']
            glm_row['is_first_down'] = int(situation['down'] == 1)
            glm_row['is_second_down'] = int(situation['down'] == 2)
            glm_row['is_third_down'] = int(situation['down'] == 3)
            glm_row['is_over_10_yards_to_first_down'] = int(situation['yards_to_first_down'] > 10)
            glm_row['is_field_goal_range'] = int(situation['distance_to_goal'] <= 35)
            first_downs += glm_row['is_first_down']
            glm_row['first_downs'] = first_downs
            glm_rows.append(glm_row)
row_data_frame = pd.DataFrame.from_dict(glm_rows)
# Release the list now that the frame holds the data.
glm_rows = None
row_data_frame.describe(include='all')
# Fit a Gaussian GLM of drive points on every candidate feature, including
# the interactions of yards to first down with the down indicators.
full_formula = (
    'points ~ '
    'distance_to_goal + yards_to_first_down + is_first_down + is_second_down + is_third_down'
    '+ first_downs'
    '+ is_over_10_yards_to_first_down'
    '+ is_field_goal_range'
    '+ yards_to_first_down : is_first_down'
    '+ yards_to_first_down : is_second_down'
    '+ yards_to_first_down : is_third_down'
)
res = sm.formula.glm(
    full_formula,
    data=row_data_frame,
    family=sm.families.Gaussian(),
).fit()
res.summary()
# Make a model with just the important things: field position, distance to
# the first down, and the down indicators.
reduced_formula = (
    'points ~ '
    'distance_to_goal + yards_to_first_down + is_first_down + is_second_down + is_third_down'
)
res = sm.formula.glm(
    reduced_formula,
    data=row_data_frame,
    family=sm.families.Gaussian(),
).fit()
res.summary()
import matplotlib.pyplot as plt
# Predict expected points for every yard line (99 down to 1) with 10 yards
# to go, once per down, then draw one curve per down.
yard_lines = range(99, 0, -1)
expected_points = {}
for down in range(1, 5):
    to_predict = pd.DataFrame.from_dict([
        {
            'distance_to_goal': yards,
            'yards_to_first_down': 10,
            'is_first_down': int(down == 1),
            'is_second_down': int(down == 2),
            'is_third_down': int(down == 3),
        }
        for yards in yard_lines
    ])
    expected_points[down] = res.predict(exog=to_predict)

for down, predicted in expected_points.items():
    plt.plot(yard_lines, predicted, label='Down {}'.format(down))
plt.legend(loc='best')
plt.xlabel('Yards To Goal')
plt.ylabel('Expected Points')