# rm(list=ls())
# Packages this script depends on.
pkg <- c("readr", "readxl", "dplyr", "stringr", "ggplot2", "tidyr")
# require() only returns FALSE (with a warning) when a package is missing, so
# check the results and stop here instead of failing later with an unrelated
# "could not find function" error. pkgload keeps one logical per package.
pkgload <- lapply(pkg, require, character.only = TRUE)
pkg_missing <- pkg[!vapply(pkgload, isTRUE, logical(1))]
if (length(pkg_missing) > 0) {
  stop(
    "Required package(s) could not be loaded: ",
    paste(pkg_missing, collapse = ", "),
    call. = FALSE
  )
}
## Loading required package: readr
## Loading required package: readxl
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: stringr
## Loading required package: ggplot2
## Loading required package: tidyr
# Demographics workbook (columns STUDY_ID, Gender, Race, Ethnicity).
# NOTE(review): absolute, machine-specific path.
data1 <- read_excel(
  path = "/Users/MichaelMiao/UCLA/uclatextbook/140sl/140\ review\ from\ Wilbur/demographicData4pm.xls"
)
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
# Restore the saved object LAB4PM from the .RData file into the workspace,
# then refer to it as data2 for the rest of the script.
# NOTE(review): absolute, machine-specific path.
load(
  file = "/Users/MichaelMiao/UCLA/uclatextbook/140sl/140\ review\ from\ Wilbur/LAB4PM.RData"
)
data2 <- LAB4PM
# Per-column overview of the demographics table.
summary(object = data1)
## STUDY_ID Gender Race
## Length:18721 Length:18721 Length:18721
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## Ethnicity
## Length:18721
## Class :character
## Mode :character
# Per-column overview of the lab table.
summary(object = data2)
## STUDY_ID AnionGap Age Year
## Length:14161 Min. : 4.00 Min. :30.00 Min. :2016
## Class :character 1st Qu.:12.00 1st Qu.:32.00 1st Qu.:2016
## Mode :character Median :14.00 Median :34.00 Median :2016
## Mean :13.77 Mean :34.45 Mean :2016
## 3rd Qu.:15.00 3rd Qu.:37.00 3rd Qu.:2017
## Max. :35.00 Max. :40.00 Max. :2017
## Inpatient_Outpatient SODIUM
## Length:14161 Min. :123.0
## Class :character 1st Qu.:139.0
## Mode :character Median :140.0
## Mean :140.1
## 3rd Qu.:142.0
## Max. :151.0
# Column names of the demographics table.
colnames(data1)
## [1] "STUDY_ID" "Gender" "Race" "Ethnicity"
# Column names of the lab table; STUDY_ID is the only name shared with data1.
colnames(data2)
## [1] "STUDY_ID" "AnionGap" "Age"
## [4] "Year" "Inpatient_Outpatient" "SODIUM"
# Combine demographics and lab values, keeping only rows whose STUDY_ID
# occurs in both tables.
datajoin <- data1 %>% inner_join(data2, by = "STUDY_ID")
#View(datajoin)
# Copy of the join with exact duplicate rows removed.
# NOTE(review): confirm that datajoind is actually used downstream.
datajoind <- datajoin %>% distinct()
# Put datajoin's columns on the search path so they can be reached by bare
# name. NOTE(review): attach() is fragile; prefer datajoin$col or with().
attach(datajoin)
# Per-column overview of the joined table.
summary(object = datajoin)
## STUDY_ID Gender Race
## Length:14161 Length:14161 Length:14161
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Ethnicity AnionGap Age Year
## Length:14161 Min. : 4.00 Min. :30.00 Min. :2016
## Class :character 1st Qu.:12.00 1st Qu.:32.00 1st Qu.:2016
## Mode :character Median :14.00 Median :34.00 Median :2016
## Mean :13.77 Mean :34.45 Mean :2016
## 3rd Qu.:15.00 3rd Qu.:37.00 3rd Qu.:2017
## Max. :35.00 Max. :40.00 Max. :2017
## Inpatient_Outpatient SODIUM
## Length:14161 Min. :123.0
## Class :character 1st Qu.:139.0
## Mode :character Median :140.0
## Mean :140.1
## 3rd Qu.:142.0
## Max. :151.0
# Mean AnionGap within each Race value.
# NOTE(review): the "NA" group in the result suggests Race holds the literal
# string "NA" rather than a missing value -- confirm.
with(datajoin, tapply(X = AnionGap, INDEX = Race, FUN = mean))
## American Indian or Alaska Native
## 14.23913
## Asian
## 13.83436
## Black or African American
## 13.15815
## Multiple Races
## 13.61905
## NA
## 13.13636
## Native Hawaiian or Other Pacific Islander
## 14.45946
## Other
## 13.80646
## Patient Refused
## 14.21692
## Unknown
## 13.92087
## White or Caucasian
## 13.65992
# Mean AnionGap within each Gender value.
with(datajoin, tapply(X = AnionGap, INDEX = Gender, FUN = mean))
## Female Male Unknown
## 13.67799 13.87348 16.00000
# Mean AnionGap within each Ethnicity value.
with(datajoin, tapply(X = AnionGap, INDEX = Ethnicity, FUN = mean))
## Cuban Hispanic or Latino
## 16.00000 13.59548
## Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a
## 14.01170 13.00000
## Not Hispanic or Latino Patient Refused
## 13.69819 14.19592
## Puerto Rican Unknown
## 12.50000 13.93228
# Mean SODIUM within each Race value.
with(datajoin, tapply(X = SODIUM, INDEX = Race, FUN = mean))
## American Indian or Alaska Native
## 139.3261
## Asian
## 140.1168
## Black or African American
## 140.0032
## Multiple Races
## 139.7738
## NA
## 139.9545
## Native Hawaiian or Other Pacific Islander
## 139.7297
## Other
## 140.1326
## Patient Refused
## 140.2988
## Unknown
## 140.2171
## White or Caucasian
## 140.1332
# Mean SODIUM within each Gender value.
with(datajoin, tapply(X = SODIUM, INDEX = Gender, FUN = mean))
## Female Male Unknown
## 139.7425 140.6421 143.0000
# Mean SODIUM within each Ethnicity value.
with(datajoin, tapply(X = SODIUM, INDEX = Ethnicity, FUN = mean))
## Cuban Hispanic or Latino
## 141.2500 139.9774
## Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a
## 140.2047 139.7419
## Not Hispanic or Latino Patient Refused
## 140.1228 140.3020
## Puerto Rican Unknown
## 140.7500 140.2659
# Share of joined rows falling in each Gender value.
prop.table(x = table(datajoin[["Gender"]]))
##
## Female Male Unknown
## 5.528564e-01 4.470729e-01 7.061648e-05
# Share of joined rows falling in each Race value.
prop.table(x = table(datajoin[["Race"]]))
##
## American Indian or Alaska Native
## 0.003248358
## Asian
## 0.102746981
## Black or African American
## 0.044205918
## Multiple Races
## 0.005931784
## NA
## 0.003107125
## Native Hawaiian or Other Pacific Islander
## 0.002612810
## Other
## 0.170397571
## Patient Refused
## 0.078454911
## Unknown
## 0.149918791
## White or Caucasian
## 0.439375750
# Row counts per Gender value. The column is taken from datajoin explicitly
# instead of depending on an attach()ed search path; naming the argument
# keeps the "Gender" header in the printed table.
table(Gender = datajoin$Gender)
## Gender
## Female Male Unknown
## 7829 6331 1
# Row counts per Race value. The column is taken from datajoin explicitly
# instead of depending on an attach()ed search path; naming the argument
# keeps the "Race" header in the printed table.
table(Race = datajoin$Race)
## Race
## American Indian or Alaska Native
## 46
## Asian
## 1455
## Black or African American
## 626
## Multiple Races
## 84
## NA
## 44
## Native Hawaiian or Other Pacific Islander
## 37
## Other
## 2413
## Patient Refused
## 1111
## Unknown
## 2123
## White or Caucasian
## 6222
# Row counts per Ethnicity value. The column is taken from datajoin
# explicitly instead of depending on an attach()ed search path; naming the
# argument keeps the "Ethnicity" header in the printed table.
table(Ethnicity = datajoin$Ethnicity)
## Ethnicity
## Cuban Hispanic or Latino
## 4 1372
## Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a
## 171 124
## Not Hispanic or Latino Patient Refused
## 9042 1225
## Puerto Rican Unknown
## 8 2215
# Copy Race into a new column Noinfo, folding "Patient Refused" into
# "Unknown"; every other value is carried over unchanged.
datajoinnew <- mutate(
  datajoin,
  Noinfo = replace(Race, Race == "Patient Refused", "Unknown")
)
# Row counts per recoded value.
table(datajoinnew[["Noinfo"]])
##
## American Indian or Alaska Native
## 46
## Asian
## 1455
## Black or African American
## 626
## Multiple Races
## 84
## NA
## 44
## Native Hawaiian or Other Pacific Islander
## 37
## Other
## 2413
## Unknown
## 3234
## White or Caucasian
## 6222
# Drop the original Race column, then let the recoded Noinfo column take
# over the name Race.
datajoinnew <- datajoinnew %>%
  select(-Race) %>%
  rename(Race = Noinfo)
# Row and column counts after the swap.
dim(datajoinnew)
## [1] 14161 9
# Recode Ethnicity into a new column NO ("Patient Refused" becomes
# "Unknown"), then remove the original Ethnicity column.
datajoinnew <- datajoinnew %>%
  mutate(
    NO = replace(Ethnicity, Ethnicity == "Patient Refused", "Unknown")
  ) %>%
  select(-Ethnicity)
# Row and column counts after the swap.
dim(datajoinnew)
## [1] 14161 9
# Give the recoded column NO the name Ethnicity.
datajoinnew <- datajoinnew %>% rename(Ethnicity = NO)
# Compact listing of the cleaned table's columns and types.
str(object = datajoinnew)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14161 obs. of 9 variables:
## $ STUDY_ID : chr "000EA425FFF3622052E55966D027AEC7" "0013E732CFD6284BBBDC47C8B3D6132A" "0014D2A96ECDA4C63B10E375F26DB63A" "00162780B66CD4C246AE00EA683D2CE9" ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ AnionGap : num 13 15 13 15 14 15 16 14 12 11 ...
## $ Age : int 33 35 38 39 33 33 32 31 35 32 ...
## $ Year : int 2016 2016 2016 2016 2016 2017 2016 2017 2017 2016 ...
## $ Inpatient_Outpatient: chr "IP" "OP" "OP" "OP" ...
## $ SODIUM : num 141 140 141 143 141 142 141 144 139 136 ...
## $ Race : chr "Other" "Other" "White or Caucasian" "White or Caucasian" ...
## $ Ethnicity : chr "Not Hispanic or Latino" "Unknown" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
# par() configures base graphics only. It is kept because it still governs
# any base plot() calls drawn on this device, but it has no effect on the
# ggplot2 figures below.
par(mfrow = c(1, 2))

# Box plots of each lab value by year.
# NOTE(review): the two-colour fill vector assumes exactly two Year values.
p_aniongap <- ggplot(data = datajoinnew, aes(factor(Year), AnionGap)) +
  geom_boxplot(fill = c("yellow", "blue"))
p_sodium <- ggplot(data = datajoinnew, aes(factor(Year), SODIUM)) +
  geom_boxplot(fill = c("yellow", "blue"))

# ggplot2 draws with grid, so the side-by-side arrangement has to come from a
# 1 x 2 grid layout; each plot is printed into its own cell.
grid::grid.newpage()
grid::pushViewport(
  grid::viewport(layout = grid::grid.layout(nrow = 1, ncol = 2))
)
print(p_aniongap, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p_sodium, vp = grid::viewport(layout.pos.row = 1, layout.pos.col = 2))
grid::popViewport()
# Exploratory base-graphics plots of the recoded data. Columns are resolved
# inside datajoinnew through with(), so nothing is attach()ed to the search
# path and no masked copies pile up there. The plot calls are written with
# bare column names, which keeps the default axis labels short.
with(datajoinnew, {
  # Lab values against year.
  plot(SODIUM, Year)
  plot(AnionGap, Year)
  # Lab values by Gender (a factor on the x axis gives box plots).
  plot(as.factor(Gender), AnionGap)
  plot(as.factor(Gender), SODIUM)
  # Lab values by Race.
  plot(as.factor(Race), AnionGap)
  plot(as.factor(Race), SODIUM)
  # Lab values by Ethnicity.
  plot(as.factor(Ethnicity), AnionGap)
  plot(as.factor(Ethnicity), SODIUM)
})
# Model SODIUM on Ethnicity, Race and Gender, each entered as a factor.
# No family is given, so glm() uses its default (gaussian).
model1 <- glm(
  formula = SODIUM ~ factor(Ethnicity) + factor(Race) + factor(Gender),
  data = datajoinnew
)
summary(object = model1)
##
## Call:
## glm(formula = SODIUM ~ factor(Ethnicity) + factor(Race) + factor(Gender),
## data = datajoinnew)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -17.2885 -1.6203 0.2097 1.3797 10.3797
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 140.12018 1.23701
## factor(Ethnicity)Hispanic or Latino -1.19461 1.18805
## factor(Ethnicity)Hispanic/Spanish origin Other -0.99804 1.20011
## factor(Ethnicity)Mexican, Mexican American, Chicano/a -1.39230 1.20539
## factor(Ethnicity)Not Hispanic or Latino -1.06050 1.18649
## factor(Ethnicity)Puerto Rican -0.49213 1.45296
## factor(Ethnicity)Unknown -0.88782 1.18953
## factor(Race)Asian 0.73789 0.35554
## factor(Race)Black or African American 0.54781 0.36271
## factor(Race)Multiple Races 0.41494 0.43520
## factor(Race)NA 0.25916 0.50791
## factor(Race)Native Hawaiian or Other Pacific Islander 0.27560 0.52417
## factor(Race)Other 0.73058 0.35342
## factor(Race)Unknown 0.59755 0.36162
## factor(Race)White or Caucasian 0.66737 0.35121
## factor(Gender)Male 0.89330 0.04027
## factor(Gender)Unknown 3.34386 2.37335
## t value Pr(>|t|)
## (Intercept) 113.273 <2e-16 ***
## factor(Ethnicity)Hispanic or Latino -1.006 0.3147
## factor(Ethnicity)Hispanic/Spanish origin Other -0.832 0.4056
## factor(Ethnicity)Mexican, Mexican American, Chicano/a -1.155 0.2481
## factor(Ethnicity)Not Hispanic or Latino -0.894 0.3714
## factor(Ethnicity)Puerto Rican -0.339 0.7348
## factor(Ethnicity)Unknown -0.746 0.4555
## factor(Race)Asian 2.075 0.0380 *
## factor(Race)Black or African American 1.510 0.1310
## factor(Race)Multiple Races 0.953 0.3404
## factor(Race)NA 0.510 0.6099
## factor(Race)Native Hawaiian or Other Pacific Islander 0.526 0.5990
## factor(Race)Other 2.067 0.0387 *
## factor(Race)Unknown 1.652 0.0985 .
## factor(Race)White or Caucasian 1.900 0.0574 .
## factor(Gender)Male 22.185 <2e-16 ***
## factor(Gender)Unknown 1.409 0.1589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.627735)
##
## Null deviance: 82581 on 14160 degrees of freedom
## Residual deviance: 79599 on 14144 degrees of freedom
## AIC: 64672
##
## Number of Fisher Scoring iterations: 2
# Model AnionGap on Ethnicity, Race and Gender, each entered as a factor.
# No family is given, so glm() uses its default (gaussian).
model2 <- glm(
  formula = AnionGap ~ factor(Ethnicity) + factor(Race) + factor(Gender),
  data = datajoinnew
)
summary(object = model2)
##
## Call:
## glm(formula = AnionGap ~ factor(Ethnicity) + factor(Race) + factor(Gender),
## data = datajoinnew)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.9460 -1.7726 0.0458 1.4140 21.4140
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 16.48070 1.32351
## factor(Ethnicity)Hispanic or Latino -2.46108 1.27112
## factor(Ethnicity)Hispanic/Spanish origin Other -2.05224 1.28403
## factor(Ethnicity)Mexican, Mexican American, Chicano/a -3.06385 1.28968
## factor(Ethnicity)Not Hispanic or Latino -2.26967 1.26945
## factor(Ethnicity)Puerto Rican -3.50846 1.55455
## factor(Ethnicity)Unknown -2.07179 1.27271
## factor(Race)Asian -0.45157 0.38040
## factor(Race)Black or African American -1.13579 0.38807
## factor(Race)Multiple Races -0.60796 0.46563
## factor(Race)NA -1.36555 0.54342
## factor(Race)Native Hawaiian or Other Pacific Islander 0.17267 0.56082
## factor(Race)Other -0.42089 0.37814
## factor(Race)Unknown -0.45472 0.38690
## factor(Race)White or Caucasian -0.62499 0.37577
## factor(Gender)Male 0.18653 0.04308
## factor(Gender)Unknown 2.40128 2.53931
## t value Pr(>|t|)
## (Intercept) 12.452 < 2e-16 ***
## factor(Ethnicity)Hispanic or Latino -1.936 0.05287 .
## factor(Ethnicity)Hispanic/Spanish origin Other -1.598 0.11000
## factor(Ethnicity)Mexican, Mexican American, Chicano/a -2.376 0.01753 *
## factor(Ethnicity)Not Hispanic or Latino -1.788 0.07381 .
## factor(Ethnicity)Puerto Rican -2.257 0.02403 *
## factor(Ethnicity)Unknown -1.628 0.10358
## factor(Race)Asian -1.187 0.23521
## factor(Race)Black or African American -2.927 0.00343 **
## factor(Race)Multiple Races -1.306 0.19168
## factor(Race)NA -2.513 0.01199 *
## factor(Race)Native Hawaiian or Other Pacific Islander 0.308 0.75818
## factor(Race)Other -1.113 0.26570
## factor(Race)Unknown -1.175 0.23991
## factor(Race)White or Caucasian -1.663 0.09629 .
## factor(Gender)Male 4.330 1.5e-05 ***
## factor(Gender)Unknown 0.946 0.34435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 6.442289)
##
## Null deviance: 92020 on 14160 degrees of freedom
## Residual deviance: 91120 on 14144 degrees of freedom
## AIC: 66586
##
## Number of Fisher Scoring iterations: 2
# Model SODIUM on AnionGap alone.
# No family is given, so glm() uses its default (gaussian).
model3 <- glm(
  formula = SODIUM ~ AnionGap,
  data = datajoinnew
)
summary(object = model3)
##
## Call:
## glm(formula = SODIUM ~ AnionGap, data = datajoinnew)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.9951 -1.4079 0.0049 1.5921 10.3964
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 137.45071 0.10905 1260.44 <2e-16 ***
## AnionGap 0.19572 0.00779 25.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.583425)
##
## Null deviance: 82581 on 14160 degrees of freedom
## Residual deviance: 79056 on 14159 degrees of freedom
## AIC: 64545
##
## Number of Fisher Scoring iterations: 2
I think there is a very small association between sodium and both gender (male) and race (Asian), and a similarly small association between anion gap and both gender (male) and race (Black). Overall, these associations are very weak. There is also an association between sodium and anion gap. In addition, both sodium and anion gap values increase as the year increases.