I work in my spare time with a lot of NBA data, and I realized I wished there was a simple and free way to get play-by-play data for the whole NBA season. Well, voila! I made a little R script function that helps solve this problem and inserts all the data into a PostgreSQL database. (It’s a lot of data…It will take a while to run theupload_game_data_to_db function)

Dependecies

library(XML)
library(MASS)
library(RPostgreSQL)
library(rvest)

Web Scraping the Data

Here we get all the game ids that have been play this season to the current date.

full_games = data.frame(games=(character()))
days = seq(as.Date('2016-10-25'),as.Date(Sys.Date()),by='day')
for(i in seq_along(days)){
    date=format(days[i],format="%Y%m%d")
    games = read_html(paste0("http://www.espn.com/nba/schedule/_/date/",date))
    games = games %>% html_nodes(xpath="//td/a") %>% html_attr("href")
    games = as.data.frame(games)
    games[,1] = as.character(games[,1])
    games = games[grep("gameId",games[,1]),]
    games = gsub("\\/nba\\/game\\?gameId=","",games)
    games = as.data.frame(games)
    games[,1] = as.character(games[,1])
    full_games = rbind(games,full_games)
}
full_games_clean = full_games[!duplicated(full_games[,1]),]
full_games_clean = as.data.frame(full_games_clean)
full_games_clean = full_games_clean[-grep("http",full_games_clean[,1]),]
full_games_clean = as.data.frame(full_games_clean)
names(full_games_clean) = "game_id"
full_games_clean$game_id = as.character(full_games_clean$game_id)

With our game_ids we use this function to get play-by-play data for each game, and the location on the court of made shots. We get the quarter the shot was made, and the player_id of the player that made the shot.

upload_game_data_to_db = function(games){
    game_id = games$game_id
    for(i in 1:length(game_id)){
        play_by_play = readHTMLTable(paste0("http://www.espn.com/nba/playbyplay?gameId=",game_id[i]))
        if(length(play_by_play)<5){
            print("Game Hasnt Happened")
        }
        else{
            print("GAME HAS HAPPEN")
            play_by_play[[2]]$quarter = 1
            play_by_play[[3]]$quarter = 2
            play_by_play[[4]]$quarter = 3
            play_by_play[[5]]$quarter = 4
            play_by_play = rbind(play_by_play[[2]],play_by_play[[3]],play_by_play[[4]],play_by_play[[5]])
            play_by_play = play_by_play[-grep("time",play_by_play[,1]),]

            play_by_play = cbind(play_by_play,game_id[i])
            play_by_play = as.data.frame(play_by_play)
            names(play_by_play) = c("time_game","team","play_detail","score","nada","quarter","game_id")
            play_by_play$quarter = as.numeric(play_by_play$quarter)
            play_by_play$game_id = as.numeric(play_by_play$game_id)

            shot_chart = read_html(paste0("http://www.espn.com/nba/playbyplay?gameId=",game_id[i]))
            made_shots_text = shot_chart %>%   
              html_nodes(xpath ='//*[contains(concat( " ", @class, " " ), concat( " ", "made", " " ))]') %>% 
              html_attr("data-text")
            made_shots_loc = shot_chart %>% 
              html_nodes(xpath ='//*[contains(concat( " ", @class, " " ), concat( " ", "made", " " ))]') %>% 
              html_attr("style")
            player_id = shot_chart %>% 
              html_nodes(xpath ='//*[contains(concat( " ", @class, " " ), concat( " ", "made", " " ))]') %>% 
              html_attr("data-shooter")
            quarter_shot = shot_chart %>% 
              html_nodes(xpath ='//*[contains(concat( " ", @class, " " ), concat( " ", "made", " " ))]') %>% 
              html_attr("data-period")
            made_shots_loc_top = sub(".*top:", "", made_shots_loc)
            made_shots_loc_left = sub(".*left:", "", made_shots_loc)

            made_shots_loc_top = gsub("%;","",made_shots_loc_top)
            made_shots_loc_left = gsub("%.*","",made_shots_loc_left)

            location_makes=cbind(made_shots_text,made_shots_loc_top,
                                     made_shots_loc_left,player_id,quarter_shot,game_id[i])
            location_makes = as.data.frame(location_makes)
            names(location_makes)[6]="game_id"
            location_makes$made_shots_loc_top =as.numeric(location_makes$made_shots_loc_top)
            location_makes$made_shots_loc_left =as.numeric(location_makes$made_shots_loc_left)
            location_makes$player_id =as.numeric(location_makes$player_id)
            location_makes$quarter_shot =as.numeric(location_makes$quarter_shot)
            location_makes$game_id =as.numeric(location_makes$game_id)

            if(i == 1){
                drv = dbDriver("PostgreSQL")
                    connection = dbConnect(drv,
                    port = "port",
                    host = "host",
                    user = "user",
                    dbname = "nba_games",
                    password = "password")
                dbWriteTable(connection, "location", value=location_makes,row.names=FALSE)
                dbWriteTable(connection, "play_by_play", value=play_by_play,row.names=FALSE)
                dbDisconnect(connection)

            }
            else{
                drv = dbDriver("PostgreSQL")
                connection = dbConnect(drv,
                    port = "port",
                    host = "host",
                    user = "user",
                    dbname = "nba_games",
                    password = "password")
                dbWriteTable(connection, "location", value=location_makes,append=TRUE,row.names=FALSE)
                dbWriteTable(connection, "play_by_play", value=play_by_play,append=TRUE,row.names=FALSE)
                dbDisconnect(connection)
            }
        }
    }
}

Next we mess with a couple queries to see what we can do with the data.

Plotting the Data

Now lets say we want Kawhi Leonard’s made shots during the 4th quarter, we can query the data and get this data. Now we can filter by certain times, or location of shots.For simple reasons let’s upload a png file of a basketball court and use ggplot to graph everything.

*Note that left side of the court is playing away, and right side of court is playing at home.

drv = dbDriver("PostgreSQL")
                connection = dbConnect(drv,
                    port = "port",
                    host = "host",
                    user = "user",
                    dbname = "nba_games",
                    password = "password")
kawhi_4th_made=dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 4 AND player_id = 6450")
all_4th_quarter_threes = dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 4 AND made_shots_text LIKE '%three pointer%'")
isiah_first =dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 1 AND player_id = 6472")
isiah_second =dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 2 AND player_id = 6472")
isiah_third = dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 3 AND player_id = 6472")
isiah_fourth =dbGetQuery(connection,"SELECT * FROM location WHERE quarter_shot = 4 AND player_id = 6472")

isiah_first$made_shots_loc_left=isiah_first$made_shots_loc_left*5.94
isiah_first$made_shots_loc_top=315-isiah_first$made_shots_loc_top*3.15

isiah_second$made_shots_loc_left=isiah_second$made_shots_loc_left*5.94
isiah_second$made_shots_loc_top=315-isiah_second$made_shots_loc_top*3.15

isiah_third$made_shots_loc_left=isiah_third$made_shots_loc_left*5.94
isiah_third$made_shots_loc_top=315-isiah_third$made_shots_loc_top*3.15

isiah_fourth$made_shots_loc_left=isiah_fourth$made_shots_loc_left*5.94
isiah_fourth$made_shots_loc_top=315-isiah_fourth$made_shots_loc_top*3.15





kawhi_4th_made$made_shots_loc_left=kawhi_4th_made$made_shots_loc_left*5.94
kawhi_4th_made$made_shots_loc_top=315-kawhi_4th_made$made_shots_loc_top*3.15
all_4th_quarter_threes$made_shots_loc_left=all_4th_quarter_threes$made_shots_loc_left*5.94
all_4th_quarter_threes$made_shots_loc_top=315-all_4th_quarter_threes$made_shots_loc_top*3.15
head(kawhi_4th_made)
##                              made_shots_text made_shots_loc_top
## 1         Kawhi Leonard makes two point shot              151.2
## 2 Kawhi Leonard makes 17-foot two point shot              214.2
## 3         Kawhi Leonard makes two point shot              176.4
## 4 Kawhi Leonard makes 11-foot two point shot              170.1
## 5 Kawhi Leonard makes 29-foot  three pointer              126.0
## 6           Kawhi Leonard makes driving dunk              157.5
##   made_shots_loc_left player_id quarter_shot   game_id
## 1              546.48      6450            4 400899470
## 2              463.32      6450            4 400900148
## 3              558.36      6450            4 400900148
## 4              481.14      6450            4 400900148
## 5              368.28      6450            4 400900148
## 6              558.36      6450            4 400900148
dbDisconnect(connection)
## [1] TRUE
library(grid)
library(jpeg)
library(ggplot2)
library(ggthemes)

courtImg.URL <- "~/Desktop/nba_shinyapp/play_by_play_data/bg-court-logo.jpeg"
court <- rasterGrob(readJPEG((courtImg.URL)))
qplot(kawhi_4th_made$made_shots_loc_left,kawhi_4th_made$made_shots_loc_top,geom="blank")+
annotation_custom(court,xmin=0, xmax=594, ymin=0, ymax=315) +geom_point(colour="red")+ coord_fixed() + xlim(0,594) +
       ylim(0, 315)+ theme_pander()+theme(line = element_blank(),
             axis.title.x = element_blank(),
             axis.title.y = element_blank(),
             axis.text.x = element_blank(),
             axis.text.y = element_blank(),
             text=element_text(family="Avenir"))+ggtitle("Kawhi Leonard 4th Quarter Shots")