Linear Weights

SDS 355

Prof. Baumer

September 15, 2025

What is the value of a play?

Watch

What happened?

  • 1st and 3rd, nobody out
  • Joey Votto singles
  • Billy Hamilton scores from first!
  • error allows Votto to advance to 2nd
  • two actual runs scored

Retrosheet data

library(tidyverse)
library(retrosheet)
cin2016 <- get_retrosheet("play", 2016, "CIN")
cin2016[[19]] |>
  pluck("play") |>
  filter(retroID == "vottj001")
   inning team  retroID count      pitches                      play
1       1    1 vottj001     0         11>C                       SB2
2       1    1 vottj001    22 11>C.SB2>FBC                         K
3       3    1 vottj001    32       BBBCFC                         K
4       5    1 vottj001     0       1311>X S8/G6+.3-H;1-H;B-2(E5/TH)
5       7    1 vottj001    22        CBBCX                63/G6M.2-3
6      10    1 vottj001     0         <NA>                        NP
7      10    1 vottj001     0         <NA>                        NP
8      10    1 vottj001     0         <NA>                        NP
9      10    1 vottj001     0         <NA>                        NP
10     10    1 vottj001    32  ....BCFFBBC                         K

How valuable is a single?

Prepare a data set

library(Lahman)
teams54 <- Teams |>
  filter(yearID >= 1954) |>
  mutate(
    X1B = H - X2B - X3B - HR,
    OUTS = AB - H,
    WK = ifelse(is.na(HBP), 0, HBP) + BB
  )

Convert to per game averages

teams54rg <- teams54 |>
  mutate(
    across(c(R, X1B, X2B, X3B, HR, WK, OUTS, SF, SB, CS), \(x) x/G
  ))

Set up our basic plot

rplot <- ggplot(teams54rg, aes(y = R, x = HR)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous("Runs Scored")
rplot

Regression model for home runs

lm(R ~ HR, data = teams54rg)

Call:
lm(formula = R ~ HR, data = teams54rg)

Coefficients:
(Intercept)           HR  
      3.004        1.498  

Singles

rplot + 
  aes(x = X1B)

Regression model for singles

lm(R ~ X1B, data = teams54rg)

Call:
lm(formula = R ~ X1B, data = teams54rg)

Coefficients:
(Intercept)          X1B  
     3.3636       0.1753  

Stolen bases

rplot + 
  aes(x = SB)

Caught stealings

rplot + 
  aes(x = CS)

Basic regression model

# Curveball, pg. 180
teams54rg |>
  filter(yearID < 2000) |>
  lm(R ~ X1B + X2B + X3B + HR + WK + SB, data = _) |>
  summary()

Call:
lm(formula = R ~ X1B + X2B + X3B + HR + WK + SB, data = filter(teams54rg, 
    yearID < 2000))

Residuals:
     Min       1Q   Median       3Q      Max 
-0.43719 -0.09737  0.00028  0.09707  0.47134 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.62043    0.08343 -31.410  < 2e-16 ***
X1B          0.51316    0.01261  40.703  < 2e-16 ***
X2B          0.61460    0.02399  25.623  < 2e-16 ***
X3B          1.29969    0.07083  18.351  < 2e-16 ***
HR           1.52896    0.02420  63.192  < 2e-16 ***
WK           0.34013    0.01074  31.659  < 2e-16 ***
SB           0.12645    0.01738   7.277 6.58e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1488 on 1071 degrees of freedom
Multiple R-squared:  0.9306,    Adjusted R-squared:  0.9302 
F-statistic:  2394 on 6 and 1071 DF,  p-value: < 2.2e-16

Full regression model

lm(R ~ X1B + X2B + X3B + HR + WK + SF + SB + CS + OUTS, data = teams54rg)

Call:
lm(formula = R ~ X1B + X2B + X3B + HR + WK + SF + SB + CS + OUTS, 
    data = teams54rg)

Coefficients:
(Intercept)          X1B          X2B          X3B           HR           WK  
     0.5564       0.4407       0.5919       1.0394       1.4725       0.3143  
         SF           SB           CS         OUTS  
     0.8012       0.1998      -0.2118      -0.1084  

Iterate over eras

lwts <- function(d) {
  lm(R ~ X1B + X2B + X3B + HR + WK + SB + CS + OUTS, data = d) |>
    coef() |>
    t() |>
    as_tibble()
}

lwts(teams54rg)
# A tibble: 1 × 9
  `(Intercept)`   X1B   X2B   X3B    HR    WK    SB     CS   OUTS
          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
1          1.05 0.483 0.556  1.25  1.49 0.325 0.189 -0.267 -0.129
teams54rg |>
  mutate(score = floor(yearID %/% 20) * 20) |>
  group_split(score) |>
  map(lwts) |>
  bind_rows()
# A tibble: 5 × 9
  `(Intercept)`   X1B   X2B   X3B    HR    WK    SB      CS    OUTS
          <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
1       -0.245  0.541 0.491 0.845  1.63 0.374 0.299  0.0611 -0.0996
2        0.873  0.472 0.615 1.42   1.52 0.291 0.219 -0.363  -0.119 
3       -0.320  0.505 0.679 1.02   1.46 0.336 0.208 -0.175  -0.0885
4        0.0815 0.503 0.770 1.09   1.50 0.312 0.168 -0.227  -0.110 
5        1.60   0.365 0.777 1.63   1.40 0.298 0.211 -0.474  -0.130 

Evaluating run estimators

Common run estimators

teams54 <- teams54 |>
  mutate(
    R = R/G,
    BAVG = H / AB,
    OBP = (H + WK) / (AB + WK + ifelse(is.na(SF), 0, SF)),
    SLG = (X1B + 2*X2B + 3*X3B + 4*HR) / AB,
    OPS = OBP + SLG,
    LWTS = 0.46*X1B + 0.8*X2B + 1.02*X3B + 1.4*HR + 0.33*WK + 0.3*SB - 0.6*CS - 0.25*(OUTS),
    XR = (0.5*X1B + 0.72*X2B + 1.04*X3B + 1.44*HR + 0.33*WK + 0.18*SB -0.32*CS - 0.098*OUTS) / G,
    RC = OBP * SLG
  ) 

Batting average

rplot <- rplot %+% teams54
rplot +
  aes(x = BAVG)

On-base percentage

rplot +
  aes(x = OBP)

Slugging percentage

rplot +
  aes(x = SLG)

OPS

rplot +
  aes(x = OPS)

eXtrapolated Runs

rplot +
  aes(x = XR)

Runs Created

rplot +
  aes(x = RC)

Correlation

teams54 |>
  select(R, BAVG, OBP, SLG, OPS, LWTS, XR, RC) |>
  cor()
             R      BAVG       OBP       SLG       OPS      LWTS        XR
R    1.0000000 0.7452374 0.8595853 0.9098903 0.9527274 0.9446182 0.9600107
BAVG 0.7452374 1.0000000 0.8414923 0.6821185 0.7812527 0.7709039 0.7790329
OBP  0.8595853 0.8414923 1.0000000 0.7242935 0.8657659 0.8839206 0.8904464
SLG  0.9098903 0.6821185 0.7242935 1.0000000 0.9721242 0.9355944 0.9483656
OPS  0.9527274 0.7812527 0.8657659 0.9721242 1.0000000 0.9796594 0.9911482
LWTS 0.9446182 0.7709039 0.8839206 0.9355944 0.9796594 1.0000000 0.9872646
XR   0.9600107 0.7790329 0.8904464 0.9483656 0.9911482 0.9872646 1.0000000
RC   0.9550842 0.7916800 0.8827493 0.9621475 0.9985340 0.9810157 0.9927272
            RC
R    0.9550842
BAVG 0.7916800
OBP  0.8827493
SLG  0.9621475
OPS  0.9985340
LWTS 0.9810157
XR   0.9927272
RC   1.0000000