library(dplyr, warn.conflicts = FALSE) # group_by, mutate, across, ungroup
library(tidyr) # to use "separate"
library(reshape2, warn.conflicts = FALSE)
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, include=TRUE)
# Load the antenatal-care table from CSV.
# NOTE(review): absolute, machine-specific path - the script only runs where
# this exact file exists; consider a project-relative path.
df0 <- rio::import("/home/passi/MEGA/PhD/PhD_report/content/docs2/Harvard/Antenatal_Care.csv")
# Long format: ID is kept as the identifier; every other column becomes one
# row, with its name in `variable` and its cell in `value`
df0 <- melt(df0, id.vars = "ID")
# Split the former column names at "_" into Year and Month
# (presumably columns are named like "2019_01" - confirm against the CSV)
df0 <- df0 %>% separate(variable, into = c("Year", "Month"), sep = "_")
# Per ID and Year: mean and SD of the monthly values (zeros included, NAs
# dropped) and the number of months whose value differs from 0.
# The statistics are repeated on every row of the group (mutate, not summarise).
df0 <- df0 %>%
  group_by(ID, Year) %>%
  mutate(
    mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    # na.rm keeps a single missing month from turning the whole group's
    # count into NA, consistent with mean/SD above
    count = sum(value != 0, na.rm = TRUE)
  ) %>%
  ungroup() # statistics are done; do not leak the grouping into later steps
# Build the mean +/- 3*SD band and keep a copy of `value` in which points
# outside that band are blanked out (NA)
df0 <- df0 %>%
  mutate(
    SD3 = 3 * SD,
    SD3_sup = mean + SD3,
    SD3_inf = mean - SD3,
    # which() ignores rows where the comparison is NA, so those rows keep
    # their original value
    value_valid = replace(value, which(value < SD3_inf | value > SD3_sup), NA)
  )
# Share of a 12-month year in which the ID reported a value different from 0
# (uses the per-ID/Year `count` column; the divisor is fixed at 12)
df0 <- mutate(group_by(df0, ID, Year), prop_month = count / 12)
# Flag rows of April or May 2020 whose value differs from 0 (1 = flagged,
# NA otherwise)
df0$ap_may_20 <- NA
df0$ap_may_20[
  with(df0, which(Year == "2020" & Month %in% c("04", "05") & value != 0))
] <- 1
# Per ID, total number of flagged rows, repeated on every row of that ID
df0 <- mutate(group_by(df0, ID), ap_may_20_s = sum(ap_may_20, na.rm = TRUE))
# Mark rows (1, otherwise NA) that satisfy both criteria: at least 90% of the
# year's months with a value different from 0, and two flagged April/May 2020
# rows for the ID
df0$month90_april_may <- NA
df0$month90_april_may[
  with(df0, which(ap_may_20_s == 2 & prop_month >= 0.9))
] <- 1
# Round every numeric column to two decimals.
# An explicit function is used instead of passing `2` through across()'s
# `...`, which is deprecated in dplyr >= 1.1.0.
# NOTE(review): if df0 is still grouped at this point, across() skips the
# grouping column(s), so they are not rounded.
df0 <- df0 %>%
  mutate(across(where(is.numeric), function(x) round(x, digits = 2)))
# Print per-column summary statistics of the processed table (output below)
summary(df0)
## ID Year Month value
## Min. :101300 Length:41448 Length:41448 Min. : 0.000
## 1st Qu.:110732 Class :character Class :character 1st Qu.: 0.000
## Median :116736 Mode :character Mode :character Median : 1.000
## Mean :121035 Mean : 6.841
## 3rd Qu.:122450 3rd Qu.: 9.000
## Max. :200595 Max. :162.000
##
## mean SD count SD3
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.300 1st Qu.: 0.510 1st Qu.: 3.000 1st Qu.: 1.54
## Median : 1.580 Median : 1.290 Median : 8.000 Median : 3.86
## Mean : 6.841 Mean : 2.483 Mean : 6.941 Mean : 7.45
## 3rd Qu.: 9.000 3rd Qu.: 3.360 3rd Qu.:10.000 3rd Qu.: 10.07
## Max. :112.250 Max. :37.780 Max. :12.000 Max. :113.34
##
## SD3_sup SD3_inf value_valid prop_month
## Min. : 0.00 Min. :-41.3200 Min. : 0.000 Min. :0.0000
## 1st Qu.: 1.95 1st Qu.: -2.2600 1st Qu.: 0.000 1st Qu.:0.2500
## Median : 5.56 Median : -1.1500 Median : 1.000 Median :0.6700
## Mean : 14.29 Mean : -0.6092 Mean : 6.866 Mean :0.5778
## 3rd Qu.: 19.30 3rd Qu.: -0.0075 3rd Qu.: 9.000 3rd Qu.:0.8300
## Max. :217.10 Max. : 43.8200 Max. :162.000 Max. :1.0000
## NA's :181
## ap_may_20 ap_may_20_s month90_april_may
## Min. :1 Min. :0.000 Min. :1
## 1st Qu.:1 1st Qu.:0.000 1st Qu.:1
## Median :1 Median :2.000 Median :1
## Mean :1 Mean :1.205 Mean :1
## 3rd Qu.:1 3rd Qu.:2.000 3rd Qu.:1
## Max. :1 Max. :2.000 Max. :1
## NA's :39177 NA's :32148
# Preview the first rows of the processed table (output below)
head(df0)
## # A tibble: 6 x 15
## # Groups: ID [6]
## ID Year Month value mean SD count SD3 SD3_sup SD3_inf value_valid
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 125402 2019 01 2 0.83 0.94 6 2.81 3.65 -1.98 2
## 2 125417 2019 01 2 1.33 0.98 9 2.95 4.29 -1.62 2
## 3 125418 2019 01 2 1.08 1.51 6 4.52 5.6 -3.43 2
## 4 125701 2019 01 4 4.67 2.15 12 6.44 11.1 -1.77 4
## 5 200380 2019 01 24 14.3 4.98 12 14.9 29.3 -0.6 24
## 6 125421 2019 01 1 0.33 0.49 4 1.48 1.81 -1.14 1
## # … with 4 more variables: prop_month <dbl>, ap_may_20 <dbl>,
## # ap_may_20_s <dbl>, month90_april_may <dbl>
# Export the processed table to a CSV file in the current working directory.
# NOTE(review): write.csv() writes row names by default, so the file gets an
# extra leading index column; consider row.names = FALSE if consumers do not
# rely on it.
write.csv(df0,"Antenatal_Care_AP.csv")
Download file