Antenatal Care - Harvard

library(tidyr) # to use "separate"
library(reshape2, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE) # group_by, mutate, across used below
# Chunk defaults for the rendered report: show code, hide warnings.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, include = TRUE)

# Read the input table. The steps below expect an "ID" column and treat every
# other column as a measurement whose name has the form "<Year>_<Month>".
# NOTE(review): absolute, machine-specific path -- consider a relative path.
df0 <- rio::import("/home/passi/MEGA/PhD/PhD_report/content/docs2/Harvard/Antenatal_Care.csv")

# Long format: one row per ID and former column, with columns "variable"
# (the old column name) and "value". The argument is spelled out as
# `id.vars` instead of relying on partial matching of `id`.
df0 <- melt(df0, id.vars = "ID")

# Split the old column name into year and month at the underscore
df0 <- df0 %>%
  separate(variable, into = c("Year", "Month"), sep = "_")
# For every ID and year, add (repeated on each monthly row of that group):
#   mean  - mean of the monthly values (zeros included, NAs dropped)
#   SD    - standard deviation of the monthly values (zeros included)
#   count - number of months whose value is different from 0
df0 <- df0 %>%
  group_by(ID, Year) %>%
  mutate(
    mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    # na.rm = TRUE keeps a single missing month from turning the whole
    # ID-year count into NA, in line with mean and SD above
    count = sum(value != 0, na.rm = TRUE)
  )
# Three standard deviations and the band mean +/- 3*SD for each row
df0$SD3 <- df0$SD * 3
df0$SD3_sup <- with(df0, mean + SD3)
df0$SD3_inf <- with(df0, mean - SD3)

# value_valid is a copy of value in which observations outside the 3*SD band
# are set to NA; rows whose comparison is NA keep their original value
df0$value_valid <- replace(
  df0$value,
  which(with(df0, value < SD3_inf | value > SD3_sup)),
  NA
)

# Share of the 12 months of each ID-year with a value different from 0
df0 <- mutate(
  group_by(df0, ID, Year),
  prop_month = count / 12
)

# Mark with 1 the rows for April or May 2020 whose value is different from 0;
# every other row stays NA
df0$ap_may_20 <- replace(
  rep(NA, nrow(df0)),
  which(
    df0$Year == "2020" &
      df0$Month %in% c("04", "05") &
      df0$value != 0
  ),
  1
)

# Per ID, number of marked rows, repeated on every row of that ID
df0 <- mutate(
  group_by(df0, ID),
  ap_may_20_s = sum(ap_may_20, na.rm = TRUE)
)

# Mark with 1 the rows that meet both conditions: prop_month of at least 0.9
# (>= 90% of months different from 0) and two marked April/May 2020 rows for
# the ID; every other row stays NA
df0$month90_april_may <- replace(
  rep(NA, nrow(df0)),
  which(df0$prop_month >= 0.9 & df0$ap_may_20_s == 2),
  1
)

# Round the numeric columns to 2 decimals. The rounding is passed as an
# explicit function because handing extra arguments to across() through `...`
# (across(cols, round, 2)) is deprecated in dplyr >= 1.1.0.
df0 <- df0 %>%
  mutate(across(where(is.numeric), function(x) round(x, digits = 2)))

# Column-wise overview of the result
summary(df0)
##        ID             Year              Month               value        
##  Min.   :101300   Length:41448       Length:41448       Min.   :  0.000  
##  1st Qu.:110732   Class :character   Class :character   1st Qu.:  0.000  
##  Median :116736   Mode  :character   Mode  :character   Median :  1.000  
##  Mean   :121035                                         Mean   :  6.841  
##  3rd Qu.:122450                                         3rd Qu.:  9.000  
##  Max.   :200595                                         Max.   :162.000  
##                                                                          
##       mean               SD             count             SD3        
##  Min.   :  0.000   Min.   : 0.000   Min.   : 0.000   Min.   :  0.00  
##  1st Qu.:  0.300   1st Qu.: 0.510   1st Qu.: 3.000   1st Qu.:  1.54  
##  Median :  1.580   Median : 1.290   Median : 8.000   Median :  3.86  
##  Mean   :  6.841   Mean   : 2.483   Mean   : 6.941   Mean   :  7.45  
##  3rd Qu.:  9.000   3rd Qu.: 3.360   3rd Qu.:10.000   3rd Qu.: 10.07  
##  Max.   :112.250   Max.   :37.780   Max.   :12.000   Max.   :113.34  
##                                                                      
##     SD3_sup          SD3_inf          value_valid        prop_month    
##  Min.   :  0.00   Min.   :-41.3200   Min.   :  0.000   Min.   :0.0000  
##  1st Qu.:  1.95   1st Qu.: -2.2600   1st Qu.:  0.000   1st Qu.:0.2500  
##  Median :  5.56   Median : -1.1500   Median :  1.000   Median :0.6700  
##  Mean   : 14.29   Mean   : -0.6092   Mean   :  6.866   Mean   :0.5778  
##  3rd Qu.: 19.30   3rd Qu.: -0.0075   3rd Qu.:  9.000   3rd Qu.:0.8300  
##  Max.   :217.10   Max.   : 43.8200   Max.   :162.000   Max.   :1.0000  
##                                      NA's   :181                       
##    ap_may_20      ap_may_20_s    month90_april_may
##  Min.   :1       Min.   :0.000   Min.   :1        
##  1st Qu.:1       1st Qu.:0.000   1st Qu.:1        
##  Median :1       Median :2.000   Median :1        
##  Mean   :1       Mean   :1.205   Mean   :1        
##  3rd Qu.:1       3rd Qu.:2.000   3rd Qu.:1        
##  Max.   :1       Max.   :2.000   Max.   :1        
##  NA's   :39177                   NA's   :32148
# Preview the first six rows of the result
head(df0, n = 6L)
## # A tibble: 6 x 15
## # Groups:   ID [6]
##       ID Year  Month value  mean    SD count   SD3 SD3_sup SD3_inf value_valid
##    <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>       <dbl>
## 1 125402 2019  01        2  0.83  0.94     6  2.81    3.65   -1.98           2
## 2 125417 2019  01        2  1.33  0.98     9  2.95    4.29   -1.62           2
## 3 125418 2019  01        2  1.08  1.51     6  4.52    5.6    -3.43           2
## 4 125701 2019  01        4  4.67  2.15    12  6.44   11.1    -1.77           4
## 5 200380 2019  01       24 14.3   4.98    12 14.9    29.3    -0.6           24
## 6 125421 2019  01        1  0.33  0.49     4  1.48    1.81   -1.14           1
## # … with 4 more variables: prop_month <dbl>, ap_may_20 <dbl>,
## #   ap_may_20_s <dbl>, month90_april_may <dbl>
# Export the result to the working directory (row names are written as the
# first, unnamed column, which is the write.csv default)
write.csv(x = df0, file = "Antenatal_Care_AP.csv")

Download file