Data cleaning

SDS 236

Benjamin S. Baumer

Smith College

Mar 12, 2026

Data cleaning

Recall: set cache directory

library(tidyverse)
# Build the path to the project's "data" directory (used as the cache
# location below) and display it.
cache_dir <- here::here("data")
print(cache_dir)
[1] "/home/runner/work/sds236/sds236/data"

Read Noho data from cache

library(sf)
# Read the cached "noho.rds" object from the cache directory.
noho <- fs::path(cache_dir, "noho.rds") |>
  read_rds()

Land owners

# Collapse the data to one row per distinct OWNER1 value, recording the
# number of rows, the summed LOT_SIZE, and the summed TOTAL_VAL for each.
#
# Note the asymmetry in missing-value handling: LOT_SIZE is summed with
# NAs dropped, whereas TOTAL_VAL is not, so a single missing TOTAL_VAL
# makes that owner's `value` NA.
owners <- noho |>
  as_tibble() |>
  group_by(OWNER1) |>
  summarise(
    num_parcels = n(),
    acreage = sum(LOT_SIZE, na.rm = TRUE),
    value = sum(TOTAL_VAL, na.rm = FALSE)
  )

# Sort owners from largest to smallest total value and show the first 15.
arrange(owners, desc(value)) |>
  print(n = 15)
# A tibble: 9,621 × 4
   OWNER1                              num_parcels acreage     value
   <chr>                                     <int>   <dbl>     <dbl>
 1 SMITH COLLEGE                               156   195.  576572860
 2 NORTHAMPTON CITY OF                         134  3231.  261625812
 3 UNITED STATES VETERANS                        1   104.   85320200
 4 NORTHAMPTON HOUSING AUTHORITY                20    36.5  78940000
 5 MASSACHUSETTS COMMONWEALTH OF                18   108.   45694310
 6 HATHAWAY FARMS TOWNHOMES                      1    18.0  29772300
 7 COOLEY DICKINSON HOSPITAL INC                 4    43.2  24912700
 8 COCA COLA COMPANY THE                         1    21.8  21486800
 9 L-3 COMMUNICATIONS CORP                       1    13.6  20451200
10 NORTHAMPTON MANAGEMENT SYSTEMS, INC           2    21.6  19308800
11 OXBOW PROFESSIONAL PARK LLC                   3    15.1  17414100
12 MEADOWBROOK PRESERVATION                      1    26.5  17222400
13 D'AMOUR PAUL H     ET AL                      1    12.2  16155600
14 CITY OF NORTHAMPTON                          42  1375.   16000700
15 LATHROP COMMUNITY INC                         6    82.6  15683000
# ℹ 9,606 more rows

Problematic variations in spellings 1

# Every owner whose name contains "SMITH COLLEGE" anywhere in the string.
filter(owners, str_detect(OWNER1, "SMITH COLLEGE"))
# A tibble: 3 × 4
  OWNER1                                num_parcels acreage     value
  <chr>                                       <int>   <dbl>     <dbl>
1 SMITH COLLEGE                                 156 195.    576572860
2 SMITH COLLEGE - FACILITIES MANAGEMENT           1   0.198    706500
3 THE TRUSTEES OF THE SMITH COLLEGE               1   0.107    231500

Problematic variations in spellings 2

# Every owner whose name contains "COMMONWEALTH" anywhere in the string.
filter(owners, str_detect(OWNER1, "COMMONWEALTH"))
# A tibble: 6 × 4
  OWNER1                        num_parcels acreage    value
  <chr>                               <int>   <dbl>    <dbl>
1 COMMONWEALTH OF MASS                    2   3.64   5566100
2 COMMONWEALTH OF MASSACHUSETTS           2  31.0     369600
3 MASS COMMONWEALTH OF                    1   0.248  1342400
4 MASSACHUSETTS COMMONWEALTH OF          18 108.    45694310
5 THE COMMONWEALTH OF                     2   1.08   7600300
6 THE COMMONWEALTH OF MASS                1   0.23     51400

Problematic variations in spellings 3

# Every owner whose name contains "NU-WAY" anywhere in the string.
filter(owners, str_detect(OWNER1, "NU-WAY"))
# A tibble: 4 × 4
  OWNER1            num_parcels acreage   value
  <chr>                   <int>   <dbl>   <dbl>
1 NU-WAY                      1   0.155  310100
2 NU-WAY HOMES                2   0.657  617600
3 NU-WAY HOMES INC            2   1.59  1405400
4 NU-WAY HOMES, INC           5   2.97  1096800

Fix #1: “manual” reconciliation

# Hand-written reconciliation: map the two known alternate spellings onto
# the canonical "SMITH COLLEGE"; every other name is carried over as-is.
owners <- owners |>
  mutate(
    owner_clean = replace_values(
      OWNER1,
      c(
        "SMITH COLLEGE - FACILITIES MANAGEMENT",
        "THE TRUSTEES OF THE SMITH COLLEGE"
      ) ~ "SMITH COLLEGE"
    )
  )

# Check the result. contains("owner") ignores case by default, so it picks
# up both OWNER1 and owner_clean.
filter(owners, str_detect(OWNER1, "SMITH COLLEGE")) |>
  select(contains("owner"))
# A tibble: 3 × 2
  OWNER1                                owner_clean  
  <chr>                                 <chr>        
1 SMITH COLLEGE                         SMITH COLLEGE
2 SMITH COLLEGE - FACILITIES MANAGEMENT SMITH COLLEGE
3 THE TRUSTEES OF THE SMITH COLLEGE     SMITH COLLEGE

Fix #2: strip business suffixes (e.g., INC)

# Strip a trailing "INC" business suffix from the owner names.
#
# - The pattern consumes the separator in front of the suffix (any run of
#   whitespace and/or commas), so "NU-WAY HOMES INC" and "NU-WAY HOMES, INC"
#   both reduce to "NU-WAY HOMES" with nothing left dangling.
# - Requiring at least one separator keeps names that merely end in the
#   letters "INC" from being clipped.
# - An optional final period ("INC.") is also accepted.
# - The input is the existing owner_clean column rather than OWNER1, so the
#   manual reconciliation already stored there is kept instead of overwritten.
owners <- owners |>
  mutate(owner_clean = str_remove(owner_clean, "[\\s,]+INC\\.?$"))

owners |>
  filter(str_detect(OWNER1, "NU-WAY")) |>
  select(contains("owner"))
# A tibble: 4 × 2
  OWNER1            owner_clean 
  <chr>             <chr>       
1 NU-WAY            NU-WAY      
2 NU-WAY HOMES      NU-WAY HOMES
3 NU-WAY HOMES INC  NU-WAY HOMES
4 NU-WAY HOMES, INC NU-WAY HOMES

Fix #3: use refinr (Smith)

# Add two automatically merged versions of the raw owner name, one from
# each of refinr's merge functions, both applied to OWNER1 directly.
owners <- owners |>
  mutate(
    owner_refine1 = OWNER1 |> refinr::key_collision_merge(),
    owner_refine2 = OWNER1 |> refinr::n_gram_merge()
  )

# Raw name next to both refinr results for the "SMITH COLLEGE" variants.
filter(owners, str_detect(OWNER1, "SMITH COLLEGE")) |>
  select(OWNER1, contains("refine"))
# A tibble: 3 × 3
  OWNER1                                owner_refine1              owner_refine2
  <chr>                                 <chr>                      <chr>        
1 SMITH COLLEGE                         SMITH COLLEGE              SMITH COLLEGE
2 SMITH COLLEGE - FACILITIES MANAGEMENT SMITH COLLEGE - FACILITIE… SMITH COLLEG…
3 THE TRUSTEES OF THE SMITH COLLEGE     THE TRUSTEES OF THE SMITH… THE TRUSTEES…

Fix #3: use refinr (Mass)

# Raw name next to both refinr results for the "COMMONWEALTH" variants.
filter(owners, str_detect(OWNER1, "COMMONWEALTH")) |>
  select(OWNER1, contains("refine"))
# A tibble: 6 × 3
  OWNER1                        owner_refine1                 owner_refine2     
  <chr>                         <chr>                         <chr>             
1 COMMONWEALTH OF MASS          COMMONWEALTH OF MASS          COMMONWEALTH OF M…
2 COMMONWEALTH OF MASSACHUSETTS COMMONWEALTH OF MASSACHUSETTS COMMONWEALTH OF M…
3 MASS COMMONWEALTH OF          COMMONWEALTH OF MASS          MASS COMMONWEALTH…
4 MASSACHUSETTS COMMONWEALTH OF COMMONWEALTH OF MASSACHUSETTS MASSACHUSETTS COM…
5 THE COMMONWEALTH OF           THE COMMONWEALTH OF           THE COMMONWEALTH …
6 THE COMMONWEALTH OF MASS      THE COMMONWEALTH OF MASS      THE COMMONWEALTH …

Fix #3: use refinr (Nu-Way)

# Raw name next to both refinr results for the "NU-WAY" variants.
filter(owners, str_detect(OWNER1, "NU-WAY")) |>
  select(OWNER1, contains("refine"))
# A tibble: 4 × 3
  OWNER1            owner_refine1    owner_refine2   
  <chr>             <chr>            <chr>           
1 NU-WAY            NU-WAY           NU-WAY          
2 NU-WAY HOMES      NU WAY HOMES INC NU WAY HOMES INC
3 NU-WAY HOMES INC  NU WAY HOMES INC NU WAY HOMES INC
4 NU-WAY HOMES, INC NU WAY HOMES INC NU WAY HOMES INC