Search code examples
rtidyverseocr

Get data by OCR and set the columns names


I run the code below to get the data of this picture but I encounter an error when try to set the column name.

library(magick)
library(tesseract)
team_img <- image_read("C:\\Users\\macosta\\Downloads\\Capture1.PNG") %>% 
  image_resize('2000x') %>%
  image_convert(type = 'Grayscale') %>%
  image_trim(fuzz = 40) %>%
  image_write(format = 'png', density = '300x300') %>%
  tesseract::ocr() %>%
  strsplit('\n') %>%
  getElement(1) %>%
  `[`(-1) %>%
  {read.table(text = .)} %>% # this part show an error
  setNames(c('Number', 'Weight', 'Specie', 'long', 'lat', 'altitude', 'Date', 'Others'))
> Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  : 
> line 2 did not have 9 elements
<- team_img
# [1] ""                                                    
# [2] "3057 4.2 vulgaris 90234\" W 14237N 1550 m 20/1/95"   
# [3] "3058 10.4 lunatus 90234°W 14237N 1550 m 20/1/95"     
# [4] "3059 12.0  lunatus 90246’W 14238’N 1700 m 20/1/95"   
# [5] "3060 4.7 ~~ vulgaris 90247'W 14242’°N 1330 m 20/1/95"
# [6] "3061 4.7 vulgaris 90247 W 14243’N 1350 m 20/1/95"    
# [7] "3062 10.8 coccineus 90951,W 14933N 2080m 20/1/95"    
# [8] "3063 6.8 __lunatus 90248°W —-14221’N 520 m 21/1/95"  
# [9] "3064 5.8 lunatus 91211\"W 14°24°N 230 m 21/1/95"     
# [10] "3065 7.0  lunatus 91208’W 14231’N 750 m 21/1/95"     
# [11] "3066 4.2 vulgaris 91207 W 14236°N 1330 m 21/1/95"    
# [12] "3067 23.3 polyanth. 91207°W 14238’N 1630 m 21/1/95"  
# [13] "3068 10.4 coccineus 91222’W 15221’N 2070 m 22/1/95"  
# [14] "3069 8.0 coccineus 91222’W 15221’N 2070 m 22/1/95"   
# [15] "3070. --- leptosta. 91221’W 15221’N 1910 m 22/1/95"  
# [16] "3071 7.8  lunatus 91254°W 15241’N 590 m 23/1/95"     
# [17] "3072 5.0 Cnido.sp. 91254°W 15241’N 600 m 23/1/95"    
# [18] "3073 0.6 _leptosta. 91248°W 15241’N 1000 m 23/1/95"  
# [19] "3074 3.9 vulgaris 91247'W 15240°N 1180 m 23/1/95"    
# [20] "3075 4.0 vulgaris 91942\"W 15939°N 1540 m 23/1/95"   
# [21] "3076 6.4 coccineus 91231’W 14249’°N 2370 m 24/1/95"  
# [22] "3077 9.5  coccineus 91229°W 14248°N 2070 m 24/1/95"  
# [23] "3078 64 coccineus 91229°W 14247N 2050 m 24/1/95"     
# [24] "3079 11.8 vulgaris 91229’W 14247'N 2050 m 24/1/95"   
# [25] "3080 10.5  lunatus 91230°W 14246’N 1750 m 24/1/95"   
# [26] "3081 6.9 vulgaris 91231’°W 14245’N 1730 m 24/1/95" 

Data picture.

enter image description here

The idea is to obtain in a DF.


Solution

  • While the Tesseract OCR does a pretty admirable job of extracting text from the sample image, it seems to struggle with consistently decoding the ° and symbols in the Latitude and Longitude columns. There is also a space between the measure and the units in the Altitude column. This results in strings that are interpreted by read.table as having either 7, 8 or 9 columns. Since your first input line appears to have 9 columns, read.table expects all subsequent lines to have 9 columns as well. You will have to figure out a way to clean the latitude, longitude output data so that you have 8 space-delimited columns you have identified in setNames

    library(magick)
    library(tesseract)
    
    team_str <- image_read("C:\\Users\\macosta\\Downloads\\Capture1.PNG") %>% 
      image_resize('2000x') %>%
      image_convert(type = 'Grayscale') %>%
      image_trim(fuzz = 40) %>%
      image_write(format = 'png', density = '300x300') %>%
      tesseract::ocr() %>%
      strsplit('\n') %>%
      getElement(1)
    
    sapply(team_str, strsplit, '\\s+') %>% lapply(length)
    
    $`Number, Weight Species Longitude Latitude Altitude Date`
    [1] 7
    
    [[2]]
    [1] 0
    
    $`3057 4.2 vulgaris 90234" W 14237N 1550 m 20/1/95`
    [1] 9
    
    $`3058 10.4 lunatus 90234°W 14237N 1550 m 20/1/95`
    [1] 8
    
    $`3059 12.0  lunatus 90246’W 14238’N 1700 m 20/1/95`
    [1] 8
    
    $`3060 4.7 ~~ vulgaris 90247'W 14242’°N 1330 m 20/1/95`
    [1] 9
    
    $`3061 4.7 vulgaris 90247 W 14243’N 1350 m 20/1/95`
    [1] 9
    
    $`3062 10.8 coccineus 90951,W 14933N 2080m 20/1/95`
    [1] 7
    
    $`3063 6.8 __lunatus 90248°W —-14221’N 520 m 21/1/95`
    [1] 8
    
    $`3064 5.8 lunatus 91211"W 14°24°N 230 m 21/1/95`
    [1] 8
    
    $`3065 7.0  lunatus 91208’W 14231’N 750 m 21/1/95`
    [1] 8
    
    $`3066 4.2 vulgaris 91207 W 14236°N 1330 m 21/1/95`
    [1] 9
    
    $`3067 23.3 polyanth. 91207°W 14238’N 1630 m 21/1/95`
    [1] 8
    
    $`3068 10.4 coccineus 91222’W 15221’N 2070 m 22/1/95`
    [1] 8
    
    $`3069 8.0 coccineus 91222’W 15221’N 2070 m 22/1/95`
    [1] 8
    
    $`3070. --- leptosta. 91221’W 15221’N 1910 m 22/1/95`
    [1] 8
    
    $`3071 7.8  lunatus 91254°W 15241’N 590 m 23/1/95`
    [1] 8
    
    $`3072 5.0 Cnido.sp. 91254°W 15241’N 600 m 23/1/95`
    [1] 8
    
    $`3073 0.6 _leptosta. 91248°W 15241’N 1000 m 23/1/95`
    [1] 8
    
    $`3074 3.9 vulgaris 91247'W 15240°N 1180 m 23/1/95`
    [1] 8
    
    $`3075 4.0 vulgaris 91942"W 15939°N 1540 m 23/1/95`
    [1] 8
    
    $`3076 6.4 coccineus 91231’W 14249’°N 2370 m 24/1/95`
    [1] 8
    
    $`3077 9.5  coccineus 91229°W 14248°N 2070 m 24/1/95`
    [1] 8
    
    $`3078 64 coccineus 91229°W 14247N 2050 m 24/1/95`
    [1] 8
    
    $`3079 11.8 vulgaris 91229’W 14247'N 2050 m 24/1/95`
    [1] 8
    
    $`3080 10.5  lunatus 91230°W 14246’N 1750 m 24/1/95`
    [1] 8
    
    $`3081 6.9 vulgaris 91231’°W 14245’N 1730 m 24/1/95`
    [1] 8
    

    Combining everything with data.table::rbindlist makes it easier to visualize what is happening and to devise a strategy to deal with malformed rows:

    sapply(team_str, str_split, '\\s+') %>% 
      lapply(t) %>%
      lapply(as.data.frame) %>%
      data.table::rbindlist(fill = TRUE) %>%
      print
    
            V1     V2         V3        V4        V5       V6      V7      V8      V9
     1: Number, Weight    Species Longitude  Latitude Altitude    Date    <NA>    <NA>
     2:           <NA>       <NA>      <NA>      <NA>     <NA>    <NA>    <NA>    <NA>
     3:    3057    4.2   vulgaris    90234"         W   14237N    1550       m 20/1/95
     4:    3058   10.4    lunatus   90234°W    14237N     1550       m 20/1/95    <NA>
     5:    3059   12.0    lunatus   90246’W   14238’N     1700       m 20/1/95    <NA>
     6:    3060    4.7         ~~  vulgaris   90247'W 14242’°N    1330       m 20/1/95
     7:    3061    4.7   vulgaris     90247         W  14243’N    1350       m 20/1/95
     8:    3062   10.8  coccineus   90951,W    14933N    2080m 20/1/95    <NA>    <NA>
     9:    3063    6.8  __lunatus   90248°W —-14221’N      520       m 21/1/95    <NA>
    10:    3064    5.8    lunatus   91211"W   14°24°N      230       m 21/1/95    <NA>
    11:    3065    7.0    lunatus   91208’W   14231’N      750       m 21/1/95    <NA>
    12:    3066    4.2   vulgaris     91207         W  14236°N    1330       m 21/1/95
    13:    3067   23.3  polyanth.   91207°W   14238’N     1630       m 21/1/95    <NA>
    14:    3068   10.4  coccineus   91222’W   15221’N     2070       m 22/1/95    <NA>
    15:    3069    8.0  coccineus   91222’W   15221’N     2070       m 22/1/95    <NA>
    16:   3070.    ---  leptosta.   91221’W   15221’N     1910       m 22/1/95    <NA>
    17:    3071    7.8    lunatus   91254°W   15241’N      590       m 23/1/95    <NA>
    18:    3072    5.0  Cnido.sp.   91254°W   15241’N      600       m 23/1/95    <NA>
    19:    3073    0.6 _leptosta.   91248°W   15241’N     1000       m 23/1/95    <NA>
    20:    3074    3.9   vulgaris   91247'W   15240°N     1180       m 23/1/95    <NA>
    21:    3075    4.0   vulgaris   91942"W   15939°N     1540       m 23/1/95    <NA>
    22:    3076    6.4  coccineus   91231’W  14249’°N     2370       m 24/1/95    <NA>
    23:    3077    9.5  coccineus   91229°W   14248°N     2070       m 24/1/95    <NA>
    24:    3078     64  coccineus   91229°W    14247N     2050       m 24/1/95    <NA>
    25:    3079   11.8   vulgaris   91229’W   14247'N     2050       m 24/1/95    <NA>
    26:    3080   10.5    lunatus   91230°W   14246’N     1750       m 24/1/95    <NA>
    27:    3081    6.9   vulgaris  91231’°W   14245’N     1730       m 24/1/95    <NA>
             V1     V2         V3        V4        V5       V6      V7      V8      V9