Search code examples
rmissing-datadata-munging

How to replace missing data in R with median data based on a condition


I have data from https://drive.google.com/file/d/0B9YMMvghK2ytSXI4RFo0clNLc28/view

basically a diamonds dataset of ~600,000 rows

It has missing values for one column price. I want to replace NA values with median prices of that particular color

summary(BigDiamonds)
##        X1             carat           cut               color          
##  Min.   :     1   Min.   :0.200   Length:598024      Length:598024     
##  1st Qu.:149507   1st Qu.:0.500   Class :character   Class :character  
##  Median :299013   Median :0.900   Mode  :character   Mode  :character  
##  Mean   :299013   Mean   :1.071                                        
##  3rd Qu.:448518   3rd Qu.:1.500                                        
##  Max.   :598024   Max.   :9.250                                        
##                                                                        
##    clarity              table           depth           cert          
##  Length:598024      Min.   : 0.00   Min.   : 0.00   Length:598024     
##  Class :character   1st Qu.:56.00   1st Qu.:61.00   Class :character  
##  Mode  :character   Median :58.00   Median :62.10   Mode  :character  
##                     Mean   :57.63   Mean   :61.06                     
##                     3rd Qu.:59.00   3rd Qu.:62.70                     
##                     Max.   :75.90   Max.   :81.30                     
##                                                                       
##  measurements           price             x                y         
##  Length:598024      Min.   :  300   Min.   : 0.150   Min.   : 1.000  
##  Class :character   1st Qu.: 1220   1st Qu.: 4.740   1st Qu.: 4.970  
##  Mode  :character   Median : 3503   Median : 5.780   Median : 6.050  
##                     Mean   : 8753   Mean   : 5.991   Mean   : 6.199  
##                     3rd Qu.:11174   3rd Qu.: 6.970   3rd Qu.: 7.230  
##                     Max.   :99990   Max.   :13.890   Max.   :13.890  
##                     NA's   :713     NA's   :1815     NA's   :1852    
##        z         
##  Min.   : 0.040  
##  1st Qu.: 3.120  
##  Median : 3.860  
##  Mean   : 4.033  
##  3rd Qu.: 4.610  
##  Max.   :13.180  
##  NA's   :2544

and

table(BigDiamonds$color)
## 
##     D     E     F     G     H     I     J     K     L 
## 73630 93483 93573 96204 86619 70282 48709 25868  9656



Diamonds2=BigDiamonds[is.na(BigDiamonds$price),]
Diamonds3=BigDiamonds[is.na(BigDiamonds$price)==F,]
library(Hmisc)
summarize(Diamonds3$price,Diamonds3$color,median)
##   Diamonds3$color Diamonds3$price
## 1               D            2690
## 2               E            2342
## 3               F            2966
## 4               G            3720
## 5               H            4535
## 6               I            4717
## 7               J            4697
## 8               K            4418
## 9               L            3017

I tried this, but its not working

Diamonds21=select(Diamonds2,price,color,cut)

Diamonds21$newprice=ifelse(Diamonds21$color=="J",4697,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="D",2690,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="E",2342,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="F",2966,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="G",3720,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="H",4535,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="I",4717,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="K",4418,Diamonds21$newprice)
Diamonds21$newprice<-ifelse(Diamonds21$color=="L",3017,Diamonds21$newprice) 

Whats wrong with my logic?


Solution

  • I tried this and it worked

    note first line is different

    Diamonds21$newprice=ifelse(Diamonds21$color=="J",4697,Diamonds21$price)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="D",2690,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="E",2342,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="F",2966,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="G",3720,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="H",4535,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="I",4717,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="K",4418,Diamonds21$newprice)
    Diamonds21$newprice<-ifelse(Diamonds21$color=="L",3017,Diamonds21$newprice)