Search code examples
rdplyrcharacterdata-wranglingis-empty

How to fill empty rows using first text value in each row in R?


Data

Here is the dput for a subset of the data I'm working with:

crime_state <- structure(list(State = c("ALABAMA", "", "ARIZONA", "", "", "", 
"", "", "", "", "", "", "ARKANSAS", "CALIFORNIA", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
""), City = c("HUNTSVILLE4", "TUSCALOOSA", "CHANDLER", "GILBERT", 
"GLENDALE", "MESA", "PEORIA", "PHOENIX", "SCOTTSDALE", "SURPRISE", 
"TEMPE", "TUCSON", "LITTLE ROCK", "ANAHEIM", "ANTIOCH", "BAKERSFIELD", 
"BERKELEY", "BURBANK", "CARLSBAD", "CHULA VISTA", "CLOVIS", "CONCORD", 
"CORONA", "COSTA MESA", "DALY CITY", "DOWNEY", "EL CAJON", "EL MONTE", 
"ELK GROVE", "ESCONDIDO", "FAIRFIELD", "FONTANA", "FREMONT", 
"FRESNO", "FULLERTON", "GARDEN GROVE", "GLENDALE", "HAYWARD", 
"HUNTINGTON BEACH", "INGLEWOOD", "IRVINE", "JURUPA VALLEY", "LANCASTER", 
"LONG BEACH", "LOS ANGELES", "MODESTO", "MORENO VALLEY", "MURRIETA", 
"NORWALK", "OAKLAND"), X = c(2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), Population1 = c("196,620", "101,764", "255,986", "247,463", 
"249,799", "504,873", "170,177", "1,653,080", "254,961", "136,611", 
"188,543", "537,392", "199,288", "354,743", "112,956", "385,609", 
"123,735", "105,041", "116,739", "274,370", "111,759", "130,855", 
"170,041", "114,358", "107,928", "113,277", "104,497", "116,464", 
"174,651", "153,073", "117,883", "213,964", "238,024", "531,818", 
"141,132", "174,661", "204,724", "162,881", "203,428", "110,726", 
"288,052", "107,605", "160,818", "470,445", "4,029,741", "215,822", 
"209,145", "114,706", "106,158", "430,230"), Violent..crime = c("", 
"253", "287", "132", "594", "967", "201", "6,118", "211", "67", 
"448", "2,027", "1,336", "578", "283", "911", "270", "129", "115", 
"406", "106", "246", "127", "170", "113", "174", "246", "181", 
"198", "237", "335", "331", "274", "1,489", "174", "266", "83", 
"330", "218", "349", "82", "134", "586", "1,702", "14,709", "963", 
"396", "49", "219", "2,675"), Murder = c(NA, 3L, 1L, 1L, 7L, 
10L, 2L, 67L, 7L, 3L, 3L, 26L, 19L, 2L, 1L, 18L, 1L, 0L, 1L, 
4L, 1L, 1L, 4L, 3L, 2L, 3L, 2L, 2L, 0L, 1L, 2L, 5L, 1L, 18L, 
0L, 2L, 0L, 0L, 0L, 7L, 0L, 5L, 4L, 11L, 132L, 8L, 7L, 2L, 5L, 
32L), Rape2 = c("", "20", "74", "49", "69", "143", "33", "566", 
"53", "10", "75", "255", "121", "70", "24", "60", "31", "7", 
"19", "43", "23", "27", "27", "35", "18", "7", "20", "12", "16", 
"28", "53", "27", "32", "93", "27", "28", "16", "55", "41", "27", 
"22", "8", "49", "107", "1,324", "47", "17", "5", "4", "214"), 
    Robbery = c("", "71", "62", "15", "171", "192", "29", "1,438", 
    "52", "20", "97", "670", "150", "195", "102", "402", "167", 
    "48", "23", "114", "15", "103", "49", "59", "45", "83", "86", 
    "78", "40", "71", "89", "94", "115", "473", "66", "90", "24", 
    "170", "59", "156", "22", "45", "142", "527", "5,139", "204", 
    "141", "18", "61", "1,220"), Aggravated..assault = c("", 
    "159", "150", "67", "347", "622", "137", "4,047", "99", "34", 
    "273", "1,076", "1,046", "311", "156", "431", "71", "74", 
    "72", "245", "67", "115", "47", "73", "48", "81", "138", 
    "89", "142", "137", "191", "205", "126", "905", "81", "146", 
    "43", "105", "118", "159", "38", "76", "391", "1,057", "8,114", 
    "704", "231", "24", "149", "1,209"), Property..crime = c("", 
    "2,092", "2,771", "1,705", "4,830", "5,113", "1,596", "28,854", 
    "2,853", "1,088", "3,567", "13,925", "6,236", "4,399", "1,479", 
    "8,330", "2,581", "1,362", "1,081", "1,905", "1,271", "2,151", 
    "1,535", "1,911", "719", "1,353", "1,164", "1,216", "1,128", 
    "1,370", "1,639", "1,683", "2,387", "8,628", "1,878", "2,096", 
    "1,491", "2,434", "1,871", "1,382", "1,670", "1,389", "1,855", 
    "5,936", "50,093", "3,965", "2,933", "661", "1,057", "10,677"
    ), Burglary = c("", "414", "369", "214", "756", "751", "281", 
    "5,465", "350", "138", "456", "1,690", "1,068", "733", "278", 
    "1,997", "403", "141", "169", "307", "160", "261", "191", 
    "265", "119", "243", "166", "257", "147", "187", "211", "289", 
    "404", "1,411", "176", "329", "188", "262", "227", "221", 
    "276", "201", "512", "1,077", "7,869", "564", "591", "128", 
    "222", "1,160"), Larceny..theft = c("", "1,569", "2,241", 
    "1,409", "3,581", "3,901", "1,224", "19,461", "2,362", "860", 
    "2,841", "10,959", "4,643", "2,969", "910", "4,875", "1,915", 
    "1,115", "830", "1,244", "1,024", "1,611", "1,082", "1,486", 
    "517", "785", "827", "677", "892", "933", "1,121", "928", 
    "1,620", "6,126", "1,460", "1,438", "1,156", "1,468", "1,485", 
    "822", "1,307", "791", "981", "3,688", "33,364", "2,819", 
    "1,828", "435", "604", "7,021"), Motor..vehicle..theft = c("", 
    "109", "161", "82", "493", "461", "91", "3,928", "141", "90", 
    "270", "1,276", "525", "697", "291", "1,458", "263", "106", 
    "82", "354", "87", "279", "262", "160", "83", "325", "171", 
    "282", "89", "250", "307", "466", "363", "1,091", "242", 
    "329", "147", "704", "159", "339", "87", "397", "362", "1,171", 
    "8,860", "582", "514", "98", "231", "2,496"), Arson3 = c(NA, 
    NA, 8L, 5L, 34L, 7L, 4L, 140L, 7L, 3L, 3L, 92L, 24L, 13L, 
    28L, 116L, 21L, 7L, 5L, 9L, 2L, 12L, 10L, 12L, 3L, 8L, 9L, 
    10L, 6L, 5L, 11L, 5L, 6L, 129L, 8L, 6L, 5L, 9L, 9L, 7L, 2L, 
    0L, 14L, 57L, 897L, 26L, 5L, 2L, 4L, 106L)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L))

Problem

I have data that looks like this, where each state has multiple counties, and thus the state rows have several empty values:

# A tibble: 276 × 14
   State     City         X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
   <chr>     <chr>    <int> <chr>   <chr>    <int> <chr> <chr>   <chr>  
 1 "ALABAMA" HUNTSVI…  2018 196,620 ""          NA ""    ""      ""     
 2 ""        TUSCALO…  2018 101,764 "253"        3 "20"  "71"    "159"  
 3 "ARIZONA" CHANDLER  2018 255,986 "287"        1 "74"  "62"    "150"  
 4 ""        GILBERT   2018 247,463 "132"        1 "49"  "15"    "67"   
 5 ""        GLENDALE  2018 249,799 "594"        7 "69"  "171"   "347"  
 6 ""        MESA      2018 504,873 "967"       10 "143" "192"   "622"  
 7 ""        PEORIA    2018 170,177 "201"        2 "33"  "29"    "137"  
 8 ""        PHOENIX   2018 1,653,… "6,118"     67 "566" "1,438" "4,047"
 9 ""        SCOTTSD…  2018 254,961 "211"        7 "53"  "52"    "99"   
10 ""        SURPRISE  2018 136,611 "67"         3 "10"  "20"    "34"

For this part of the data, Alabama should fill the first two rows, and Arizona should be filling the last 8 rows that are empty. I tried using the methods in this post 6 years ago but using this code (supplying as.character since that is what most of the data is here):

library(tidyverse)

crime_state %>%
  mutate_all(as.character) %>%
  fill(names(.), 
       .direction = "up")

I have no change in the rows I need fixed. Is there a better alternative for my data? This would be the ideal tibble:

# A tibble: 276 × 14
   State     City     X     Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³
   <chr>     <chr>    <chr> <chr>   <chr>   <chr>  <chr> <chr>   <chr>  
 1 "ALABAMA" HUNTSVI… 2018  196,620 ""      3      ""    ""      ""     
 2 "ALABAMA" TUSCALO… 2018  101,764 "253"   3      "20"  "71"    "159"  
 3 "ARIZONA" CHANDLER 2018  255,986 "287"   1      "74"  "62"    "150"  
 4 "ARIZONA" GILBERT  2018  247,463 "132"   1      "49"  "15"    "67"   
 5 "ARIZONA" GLENDALE 2018  249,799 "594"   7      "69"  "171"   "347"  
 6 "ARIZONA" MESA     2018  504,873 "967"   10     "143" "192"   "622"  
 7 "ARIZONA" PEORIA   2018  170,177 "201"   2      "33"  "29"    "137"  
 8 "ARIZONA" PHOENIX  2018  1,653,… "6,118" 67     "566" "1,438" "4,047"
 9 "ARIZONA" SCOTTSD… 2018  254,961 "211"   7      "53"  "52"    "99"   
10 "ARIZONA" SURPRISE 2018  136,611 "67"    3      "10"  "20"    "34" 

Solution

  • What about:

    [...]
    crime_state |>
      dplyr::mutate(State = ifelse(State == "", NA, State)) |>
      tidyr::fill(State)
    #> # A tibble: 50 × 14
    #>    State   City           X Popul…¹ Viole…² Murder Rape2 Robbery Aggra…³ Prope…⁴
    #>    <chr>   <chr>      <int> <chr>   <chr>    <int> <chr> <chr>   <chr>   <chr>  
    #>  1 ALABAMA HUNTSVILL…  2018 196,620 ""          NA ""    ""      ""      ""     
    #>  2 ALABAMA TUSCALOOSA  2018 101,764 "253"        3 "20"  "71"    "159"   "2,092"
    #>  3 ARIZONA CHANDLER    2018 255,986 "287"        1 "74"  "62"    "150"   "2,771"
    #>  4 ARIZONA GILBERT     2018 247,463 "132"        1 "49"  "15"    "67"    "1,705"
    #>  5 ARIZONA GLENDALE    2018 249,799 "594"        7 "69"  "171"   "347"   "4,830"
    #>  6 ARIZONA MESA        2018 504,873 "967"       10 "143" "192"   "622"   "5,113"
    #>  7 ARIZONA PEORIA      2018 170,177 "201"        2 "33"  "29"    "137"   "1,596"
    #>  8 ARIZONA PHOENIX     2018 1,653,… "6,118"     67 "566" "1,438" "4,047" "28,85…
    #>  9 ARIZONA SCOTTSDALE  2018 254,961 "211"        7 "53"  "52"    "99"    "2,853"
    #> 10 ARIZONA SURPRISE    2018 136,611 "67"         3 "10"  "20"    "34"    "1,088"
    #> # … with 40 more rows, 4 more variables: Burglary <chr>, Larceny..theft <chr>,
    #> #   Motor..vehicle..theft <chr>, Arson3 <int>, and abbreviated variable names
    #> #   ¹​Population1, ²​Violent..crime, ³​Aggravated..assault, ⁴​Property..crime
    

    Created on 2022-10-22 with reprex v2.0.2