Building a database

SDS 236

Ben Baumer

2025-03-01

Building your own database

Consider the data posted here

https://www.doe.mass.edu/infoservices/reports/enroll/default.html

1. Define the parameters for the query string

year <- 2002:2023
var <- paste0(
  "sped", 
  str_sub(year, start = 3, end = 4), 
  str_pad(year - 1999, side = "left", pad = "0", width = 2)
)
url <- paste0(
  "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=", 
  var
)
url
 [1] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0203"
 [2] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0304"
 [3] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0405"
 [4] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0506"
 [5] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0607"
 [6] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0708"
 [7] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0809"
 [8] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped0910"
 [9] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1011"
[10] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1112"
[11] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1213"
[12] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1314"
[13] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1415"
[14] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1516"
[15] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1617"
[16] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1718"
[17] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1819"
[18] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped1920"
[19] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped2021"
[20] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped2122"
[21] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped2223"
[22] "https://www.doe.mass.edu/infoservices/reports/enroll/default.html?yr=sped2324"

2. Scrape all of the web pages

library(rvest)

pages <- url |>
  map(read_html)

pages
[[1]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[2]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[3]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[4]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[5]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[6]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[7]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[8]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[9]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[10]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[11]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[12]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[13]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[14]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[15]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[16]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[17]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[18]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[19]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[20]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[21]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

[[22]]
{html_document}
<html lang="en-US">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body>\r\n\t\t\t<div id="high"></div>\r\n\t\t\t<!-- Google Analytics -->\ ...

5. Compute remote and local file names

src <- paste0("https://www.doe.mass.edu", age_files)

dest <- src |> 
  dirname() |>
  basename() |>
  paste0(".xlsx")

dest
 [1] "sped13.xlsx"   "sped14.xlsx"   "sped2015.xlsx" "sped2016.xlsx"
 [5] "sped2017.xlsx" "sped2018.xlsx" "sped2019.xlsx" "sped2020.xlsx"
 [9] "sped2021.xlsx" "sped2022.xlsx" "sped2023.xlsx" "sped2024.xlsx"

6. Download files

walk2(src, dest, download.file)

list.files(pattern = ".xlsx")
 [1] "sped13.xlsx"   "sped14.xlsx"   "sped2015.xlsx" "sped2016.xlsx"
 [5] "sped2017.xlsx" "sped2018.xlsx" "sped2019.xlsx" "sped2020.xlsx"
 [9] "sped2021.xlsx" "sped2022.xlsx" "sped2023.xlsx" "sped2024.xlsx"

7. Read .xlsx files

sped <- dest |>
  map(readxl::read_excel)

8. Inspect data

sped[[1]]
# A tibble: 406 × 25
   ORG_CODE District_Name Adjusted_Total SWD_Total AGE_3TO5 AGE_6TO21 AGE_3
   <chr>    <chr>                  <dbl> <chr>     <chr>    <chr>     <chr>
 1 0001     Abington                2060 292       20       272       5    
 2 0002     Acton                   2531 387       52       335       12   
 3 0003     Acushnet                1004 191       21       170       3    
 4 0005     Agawam                  4170 633       78       555       23   
 5 0007     Amesbury                2371 450       49       401       15   
 6 0008     Amherst                 1209 235       37       198       8    
 7 0009     Andover                 6307 1039      63       976       11   
 8 0010     Arlington               4990 715       55       660       8    
 9 0014     Ashland                 2600 376       30       346       7    
10 0016     Attleboro               5937 1036      124      912       32   
# ℹ 396 more rows
# ℹ 18 more variables: AGE_4 <chr>, AGE_5 <chr>, AGE_6 <chr>, AGE_7 <chr>,
#   AGE_8 <chr>, AGE_9 <chr>, AGE_10 <chr>, AGE_11 <chr>, AGE_12 <chr>,
#   AGE_13 <chr>, AGE_14 <chr>, AGE_15 <chr>, AGE_16 <chr>, AGE_17 <chr>,
#   AGE_18 <chr>, AGE_19 <chr>, AGE_20 <chr>, AGE_21 <chr>
sped[[9]]
# A tibble: 409 × 25
   ...1     ...2          ...3   ...4  ...5  ...6  ...7  ...8  ...9  ...10 ...11
   <chr>    <chr>         <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 <NA>     <NA>          <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
 2 <NA>     <NA>          <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
 3 <NA>     <NA>          <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
 4 <NA>     <NA>          <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
 5 <NA>     <NA>          <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
 6 ORG_CODE DISTRICT_NAME Adjus… SWD_… AGE_… AGE_… AGE_3 AGE_4 AGE_5 AGE_6 AGE_7
 7 0001     Abington      2146   340   26    314   5     20    1     28    25   
 8 0003     Acushnet      923    126   13    113   7     5     1     6     13   
 9 0005     Agawam        3541   607   60    547   23    33    4     33    32   
10 0007     Amesbury      1889   419   7     412   1     6     0     17    20   
# ℹ 399 more rows
# ℹ 14 more variables:
#   `Massachusetts Department of Elementary and Secondary Education` <chr>,
#   ...13 <chr>, ...14 <chr>, ...15 <chr>, ...16 <chr>, ...17 <chr>,
#   ...18 <chr>, ...19 <chr>, ...20 <chr>, ...21 <chr>, ...22 <chr>,
#   ...23 <chr>, ...24 <chr>, ...25 <chr>

9. Make adjustments

  • Make sure each file has the same format!
  • In some cases, we need to use the skip argument
  • Add the correct year to each file!
  • Fix all the column types
  • Combine the data sets (using list_rbind())

The finished product

data_good
# A tibble: 4,837 × 26
   year  org_code district_name adjusted_total swd_total age_3to5 age_6to21
   <chr> <chr>    <chr>                  <dbl>     <dbl>    <dbl>     <dbl>
 1 2024  0001     Abington                2172       408       20       388
 2 2024  0003     Acushnet                 949       156       11       145
 3 2024  0005     Agawam                  3536       675       51       624
 4 2024  0007     Amesbury                1850       493       35       458
 5 2024  0008     Amherst                  985       214       29       185
 6 2024  0009     Andover                 5504      1052       47      1005
 7 2024  0010     Arlington               6051      1036       38       998
 8 2024  0014     Ashland                 2933       595       29       566
 9 2024  0016     Attleboro               6097      1142      100      1042
10 2024  0017     Auburn                  2513       354       21       333
# ℹ 4,827 more rows
# ℹ 19 more variables: age_3 <dbl>, age_4 <dbl>, age_5 <dbl>, age_6 <dbl>,
#   age_7 <dbl>, age_8 <dbl>, age_9 <dbl>, age_10 <dbl>, age_11 <dbl>,
#   age_12 <dbl>, age_13 <dbl>, age_14 <dbl>, age_15 <dbl>, age_16 <dbl>,
#   age_17 <dbl>, age_18 <dbl>, age_19 <dbl>, age_20 <dbl>, age_21 <dbl>

Local look

data_good |>
  filter(district_name == "Northampton")
# A tibble: 12 × 26
   year  org_code district_name adjusted_total swd_total age_3to5 age_6to21
   <chr> <chr>    <chr>                  <dbl>     <dbl>    <dbl>     <dbl>
 1 2024  0210     Northampton             2516       548       15       533
 2 2023  0210     Northampton             2594       571       26       545
 3 2022  0210     Northampton             2640       586       47       539
 4 2021  0210     Northampton             2608       536       23       513
 5 2020  0210     Northampton             2724       548       55       493
 6 2019  0210     Northampton             2657       570       52       518
 7 2018  0210     Northampton             2696       576       54       522
 8 2017  0210     Northampton             2716       529       39       490
 9 2016  0210     Northampton             2697       581       57       524
10 2015  0210     Northampton             2739       634       53       581
11 2014  0210     Northampton             2786       610       47       563
12 2013  0210     Northampton             2765       562       38       524
# ℹ 19 more variables: age_3 <dbl>, age_4 <dbl>, age_5 <dbl>, age_6 <dbl>,
#   age_7 <dbl>, age_8 <dbl>, age_9 <dbl>, age_10 <dbl>, age_11 <dbl>,
#   age_12 <dbl>, age_13 <dbl>, age_14 <dbl>, age_15 <dbl>, age_16 <dbl>,
#   age_17 <dbl>, age_18 <dbl>, age_19 <dbl>, age_20 <dbl>, age_21 <dbl>

Summary statistics

data_good |>
  mutate(swd_pct = swd_total / adjusted_total) |>
  group_by(year) |>
  summarize(
    num_districts = n(),
    num_students = sum(adjusted_total),
    avg_pct = mean(swd_pct, na.rm = TRUE)
  )
# A tibble: 12 × 4
   year  num_districts num_students avg_pct
   <chr>         <int>        <dbl>   <dbl>
 1 2013            403       965602   0.163
 2 2014            408       966360   0.165
 3 2015            405       966391   0.169
 4 2016            407       964026   0.170
 5 2017            404       964514   0.171
 6 2018            406       964806   0.177
 7 2019            406       962297   0.181
 8 2020            403       959394   0.184
 9 2021            400       921712   0.187
10 2022            399       921180   0.188
11 2023            397       923349   0.194
12 2024            399       924947   0.202

Your turn!