## 1.

 # rm(list=ls())
# Attach every package the analysis uses. library() stops with an error as
# soon as a package is missing, whereas require() only returns FALSE and lets
# the script fail later at the first call into the missing package.
pkg <- c("readr", "readxl", "dplyr", "stringr", "ggplot2", "tidyr")
pkgload <- lapply(pkg, library, character.only = TRUE)
## Loading required package: readr
## Loading required package: readxl
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: stringr
## Loading required package: ggplot2
## Loading required package: tidyr
# Both input files sit in the same folder, so the folder is spelled once and
# each path is built with file.path(). The original literals wrote the spaces
# as backslash-space, an escape that ?Quotes does not list; plain spaces give
# the same path.
data_dir <- "/Users/MichaelMiao/UCLA/uclatextbook/140sl/140 review from Wilbur"

# Demographic table (columns STUDY_ID, Gender, Race, Ethnicity).
data1 <- read_excel(file.path(data_dir, "demographicData4pm.xls"))
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
# load() restores saved objects by name and returns those names; stop here
# with a clear failure if the lab table is not among them.
loaded <- load(file.path(data_dir, "LAB4PM.RData"))
stopifnot("LAB4PM" %in% loaded)
data2 <- LAB4PM
summary(data1)
##    STUDY_ID            Gender              Race          
##  Length:18721       Length:18721       Length:18721      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##   Ethnicity        
##  Length:18721      
##  Class :character  
##  Mode  :character
# Column-by-column summary of the lab table loaded from LAB4PM.RData.
summary(data2)
##    STUDY_ID            AnionGap          Age             Year     
##  Length:14161       Min.   : 4.00   Min.   :30.00   Min.   :2016  
##  Class :character   1st Qu.:12.00   1st Qu.:32.00   1st Qu.:2016  
##  Mode  :character   Median :14.00   Median :34.00   Median :2016  
##                     Mean   :13.77   Mean   :34.45   Mean   :2016  
##                     3rd Qu.:15.00   3rd Qu.:37.00   3rd Qu.:2017  
##                     Max.   :35.00   Max.   :40.00   Max.   :2017  
##  Inpatient_Outpatient     SODIUM     
##  Length:14161         Min.   :123.0  
##  Class :character     1st Qu.:139.0  
##  Mode  :character     Median :140.0  
##                       Mean   :140.1  
##                       3rd Qu.:142.0  
##                       Max.   :151.0
# Column names of each input table, to find the key they share.
colnames(data1)
## [1] "STUDY_ID"  "Gender"    "Race"      "Ethnicity"
colnames(data2)
## [1] "STUDY_ID"             "AnionGap"             "Age"                 
## [4] "Year"                 "Inpatient_Outpatient" "SODIUM"
# Combine demographics and labs, keeping only STUDY_IDs found in both tables.
datajoin <- data1 %>%
  inner_join(data2, by = "STUDY_ID")
#View(datajoin)
# De-duplicated copy of the joined table.
datajoind <- datajoin %>%
  distinct()
# NOTE(review): attach() places datajoin's columns on the search path; the
# bare-name table(Gender) / table(Race) / table(Ethnicity) calls further down
# rely on it.
attach(datajoin)
summary(datajoin)
##    STUDY_ID            Gender              Race          
##  Length:14161       Length:14161       Length:14161      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   Ethnicity            AnionGap          Age             Year     
##  Length:14161       Min.   : 4.00   Min.   :30.00   Min.   :2016  
##  Class :character   1st Qu.:12.00   1st Qu.:32.00   1st Qu.:2016  
##  Mode  :character   Median :14.00   Median :34.00   Median :2016  
##                     Mean   :13.77   Mean   :34.45   Mean   :2016  
##                     3rd Qu.:15.00   3rd Qu.:37.00   3rd Qu.:2017  
##                     Max.   :35.00   Max.   :40.00   Max.   :2017  
##  Inpatient_Outpatient     SODIUM     
##  Length:14161         Min.   :123.0  
##  Class :character     1st Qu.:139.0  
##  Mode  :character     Median :140.0  
##                       Mean   :140.1  
##                       3rd Qu.:142.0  
##                       Max.   :151.0
# Mean anion gap within each Race level.
with(datajoin, tapply(AnionGap, Race, mean))
##          American Indian or Alaska Native 
##                                  14.23913 
##                                     Asian 
##                                  13.83436 
##                 Black or African American 
##                                  13.15815 
##                            Multiple Races 
##                                  13.61905 
##                                        NA 
##                                  13.13636 
## Native Hawaiian or Other Pacific Islander 
##                                  14.45946 
##                                     Other 
##                                  13.80646 
##                           Patient Refused 
##                                  14.21692 
##                                   Unknown 
##                                  13.92087 
##                        White or Caucasian 
##                                  13.65992
# Mean anion gap within each Gender level, then within each Ethnicity level.
with(datajoin, tapply(AnionGap, Gender, mean))
##   Female     Male  Unknown 
## 13.67799 13.87348 16.00000
with(datajoin, tapply(AnionGap, Ethnicity, mean))
##                                Cuban                   Hispanic or Latino 
##                             16.00000                             13.59548 
##        Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a 
##                             14.01170                             13.00000 
##               Not Hispanic or Latino                      Patient Refused 
##                             13.69819                             14.19592 
##                         Puerto Rican                              Unknown 
##                             12.50000                             13.93228
# Mean sodium within each Race level.
with(datajoin, tapply(SODIUM, Race, mean))
##          American Indian or Alaska Native 
##                                  139.3261 
##                                     Asian 
##                                  140.1168 
##                 Black or African American 
##                                  140.0032 
##                            Multiple Races 
##                                  139.7738 
##                                        NA 
##                                  139.9545 
## Native Hawaiian or Other Pacific Islander 
##                                  139.7297 
##                                     Other 
##                                  140.1326 
##                           Patient Refused 
##                                  140.2988 
##                                   Unknown 
##                                  140.2171 
##                        White or Caucasian 
##                                  140.1332
# Mean sodium within each Gender level, then within each Ethnicity level.
with(datajoin, tapply(SODIUM, Gender, mean))
##   Female     Male  Unknown 
## 139.7425 140.6421 143.0000
with(datajoin, tapply(SODIUM, Ethnicity, mean))
##                                Cuban                   Hispanic or Latino 
##                             141.2500                             139.9774 
##        Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a 
##                             140.2047                             139.7419 
##               Not Hispanic or Latino                      Patient Refused 
##                             140.1228                             140.3020 
##                         Puerto Rican                              Unknown 
##                             140.7500                             140.2659
# Share of joined rows falling in each Gender level, then in each Race level.
prop.table(table(datajoin[["Gender"]]))
## 
##       Female         Male      Unknown 
## 5.528564e-01 4.470729e-01 7.061648e-05
prop.table(table(datajoin[["Race"]]))
## 
##          American Indian or Alaska Native 
##                               0.003248358 
##                                     Asian 
##                               0.102746981 
##                 Black or African American 
##                               0.044205918 
##                            Multiple Races 
##                               0.005931784 
##                                        NA 
##                               0.003107125 
## Native Hawaiian or Other Pacific Islander 
##                               0.002612810 
##                                     Other 
##                               0.170397571 
##                           Patient Refused 
##                               0.078454911 
##                                   Unknown 
##                               0.149918791 
##                        White or Caucasian 
##                               0.439375750
# Row counts per Gender and per Race. The columns are taken from datajoin
# explicitly instead of through attach(); naming the argument keeps the
# variable name as the header of the printed table.
table(Gender = datajoin$Gender)
## Gender
##  Female    Male Unknown 
##    7829    6331       1
table(Race = datajoin$Race)
## Race
##          American Indian or Alaska Native 
##                                        46 
##                                     Asian 
##                                      1455 
##                 Black or African American 
##                                       626 
##                            Multiple Races 
##                                        84 
##                                        NA 
##                                        44 
## Native Hawaiian or Other Pacific Islander 
##                                        37 
##                                     Other 
##                                      2413 
##                           Patient Refused 
##                                      1111 
##                                   Unknown 
##                                      2123 
##                        White or Caucasian 
##                                      6222
# Row counts per Ethnicity, read from datajoin explicitly rather than through
# attach(); the named argument keeps "Ethnicity" as the printed header.
table(Ethnicity = datajoin$Ethnicity)
## Ethnicity
##                                Cuban                   Hispanic or Latino 
##                                    4                                 1372 
##        Hispanic/Spanish origin Other Mexican, Mexican American, Chicano/a 
##                                  171                                  124 
##               Not Hispanic or Latino                      Patient Refused 
##                                 9042                                 1225 
##                         Puerto Rican                              Unknown 
##                                    8                                 2215
# Fold "Patient Refused" into "Unknown" in a new Noinfo column (Race itself is
# left untouched at this step), then tabulate the recoded values.
datajoinnew <- mutate(
  datajoin,
  Noinfo = if_else(Race == "Patient Refused", "Unknown", Race)
)
table(datajoinnew$Noinfo)
## 
##          American Indian or Alaska Native 
##                                        46 
##                                     Asian 
##                                      1455 
##                 Black or African American 
##                                       626 
##                            Multiple Races 
##                                        84 
##                                        NA 
##                                        44 
## Native Hawaiian or Other Pacific Islander 
##                                        37 
##                                     Other 
##                                      2413 
##                                   Unknown 
##                                      3234 
##                        White or Caucasian 
##                                      6222
# Swap the recoded Noinfo column in for the original Race column. A plain
# select(-Race) replaces the superseded select_at(vars(-c(Race))) form; the
# recoded column still ends up last, as before.
datajoinnew <- datajoinnew %>%
  select(-Race) %>%
  rename(Race = Noinfo)
dim(datajoinnew)
## [1] 14161     9
# Same treatment for Ethnicity: recode "Patient Refused" to "Unknown" in a
# temporary NO column, then drop the original column.
datajoinnew <- datajoinnew %>%
  mutate(NO = replace(Ethnicity, Ethnicity == "Patient Refused", "Unknown")) %>%
  select(-Ethnicity)
dim(datajoinnew)
## [1] 14161     9
# Give the recoded column the original Ethnicity name and inspect the result.
datajoinnew <- rename(datajoinnew, Ethnicity = NO)
str(datajoinnew)
## Classes 'tbl_df', 'tbl' and 'data.frame':    14161 obs. of  9 variables:
##  $ STUDY_ID            : chr  "000EA425FFF3622052E55966D027AEC7" "0013E732CFD6284BBBDC47C8B3D6132A" "0014D2A96ECDA4C63B10E375F26DB63A" "00162780B66CD4C246AE00EA683D2CE9" ...
##  $ Gender              : chr  "Female" "Female" "Female" "Female" ...
##  $ AnionGap            : num  13 15 13 15 14 15 16 14 12 11 ...
##  $ Age                 : int  33 35 38 39 33 33 32 31 35 32 ...
##  $ Year                : int  2016 2016 2016 2016 2016 2017 2016 2017 2017 2016 ...
##  $ Inpatient_Outpatient: chr  "IP" "OP" "OP" "OP" ...
##  $ SODIUM              : num  141 140 141 143 141 142 141 144 139 136 ...
##  $ Race                : chr  "Other" "Other" "White or Caucasian" "White or Caucasian" ...
##  $ Ethnicity           : chr  "Not Hispanic or Latino" "Unknown" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
# NOTE(review): par(mfrow) sets the base-graphics panel layout; ggplot2 draws
# through grid, so this call does not place the two boxplots side by side.
par(mfrow = c(1, 2))
# Anion gap by year. The fill vector supplies one colour per box, so it
# assumes exactly two Year levels.
ggplot(datajoinnew, aes(x = factor(Year), y = AnionGap)) +
  geom_boxplot(fill = c("yellow", "blue"))

# Sodium by year, same colouring.
ggplot(datajoinnew, aes(x = factor(Year), y = SODIUM)) +
  geom_boxplot(fill = c("yellow", "blue"))

# Attached once so that bare column names resolve to the recoded table rather
# than to the earlier attach(datajoin). Attaching the same table a second time
# only stacks a duplicate copy on the search path, so it is done just here.
attach(datajoinnew)
## The following objects are masked from datajoin:
## 
##     Age, AnionGap, Ethnicity, Gender, Inpatient_Outpatient, Race,
##     SODIUM, STUDY_ID, Year
# Exploratory base-graphics plots. with() takes the columns straight from
# datajoinnew, so the plots do not depend on what happens to be attached; the
# plotted expressions (and therefore the axis labels) are unchanged.
with(datajoinnew, plot(SODIUM, Year))

with(datajoinnew, plot(AnionGap, Year))

# Passing a factor as x gives one box per level of that factor.
with(datajoinnew, plot(as.factor(Gender), AnionGap))

with(datajoinnew, plot(as.factor(Gender), SODIUM))

with(datajoinnew, plot(as.factor(Race), AnionGap))

with(datajoinnew, plot(as.factor(Race), SODIUM))

with(datajoinnew, plot(as.factor(Ethnicity), AnionGap))

with(datajoinnew, plot(as.factor(Ethnicity), SODIUM))

# Sodium modelled on the three demographic variables. glm() is called without
# a family argument, so the default gaussian family is used; factor() in the
# formula makes each predictor categorical.
model1 <- glm(
  formula = SODIUM ~ factor(Ethnicity) + factor(Race) + factor(Gender),
  data = datajoinnew
)
summary(model1)
## 
## Call:
## glm(formula = SODIUM ~ factor(Ethnicity) + factor(Race) + factor(Gender), 
##     data = datajoinnew)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -17.2885   -1.6203    0.2097    1.3797   10.3797  
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                           140.12018    1.23701
## factor(Ethnicity)Hispanic or Latino                    -1.19461    1.18805
## factor(Ethnicity)Hispanic/Spanish origin Other         -0.99804    1.20011
## factor(Ethnicity)Mexican, Mexican American, Chicano/a  -1.39230    1.20539
## factor(Ethnicity)Not Hispanic or Latino                -1.06050    1.18649
## factor(Ethnicity)Puerto Rican                          -0.49213    1.45296
## factor(Ethnicity)Unknown                               -0.88782    1.18953
## factor(Race)Asian                                       0.73789    0.35554
## factor(Race)Black or African American                   0.54781    0.36271
## factor(Race)Multiple Races                              0.41494    0.43520
## factor(Race)NA                                          0.25916    0.50791
## factor(Race)Native Hawaiian or Other Pacific Islander   0.27560    0.52417
## factor(Race)Other                                       0.73058    0.35342
## factor(Race)Unknown                                     0.59755    0.36162
## factor(Race)White or Caucasian                          0.66737    0.35121
## factor(Gender)Male                                      0.89330    0.04027
## factor(Gender)Unknown                                   3.34386    2.37335
##                                                       t value Pr(>|t|)    
## (Intercept)                                           113.273   <2e-16 ***
## factor(Ethnicity)Hispanic or Latino                    -1.006   0.3147    
## factor(Ethnicity)Hispanic/Spanish origin Other         -0.832   0.4056    
## factor(Ethnicity)Mexican, Mexican American, Chicano/a  -1.155   0.2481    
## factor(Ethnicity)Not Hispanic or Latino                -0.894   0.3714    
## factor(Ethnicity)Puerto Rican                          -0.339   0.7348    
## factor(Ethnicity)Unknown                               -0.746   0.4555    
## factor(Race)Asian                                       2.075   0.0380 *  
## factor(Race)Black or African American                   1.510   0.1310    
## factor(Race)Multiple Races                              0.953   0.3404    
## factor(Race)NA                                          0.510   0.6099    
## factor(Race)Native Hawaiian or Other Pacific Islander   0.526   0.5990    
## factor(Race)Other                                       2.067   0.0387 *  
## factor(Race)Unknown                                     1.652   0.0985 .  
## factor(Race)White or Caucasian                          1.900   0.0574 .  
## factor(Gender)Male                                     22.185   <2e-16 ***
## factor(Gender)Unknown                                   1.409   0.1589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.627735)
## 
##     Null deviance: 82581  on 14160  degrees of freedom
## Residual deviance: 79599  on 14144  degrees of freedom
## AIC: 64672
## 
## Number of Fisher Scoring iterations: 2
# Anion gap modelled on the same three categorical predictors, again with
# glm()'s default gaussian family.
model2 <- glm(
  formula = AnionGap ~ factor(Ethnicity) + factor(Race) + factor(Gender),
  data = datajoinnew
)
summary(model2)
## 
## Call:
## glm(formula = AnionGap ~ factor(Ethnicity) + factor(Race) + factor(Gender), 
##     data = datajoinnew)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9.9460  -1.7726   0.0458   1.4140  21.4140  
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                           16.48070    1.32351
## factor(Ethnicity)Hispanic or Latino                   -2.46108    1.27112
## factor(Ethnicity)Hispanic/Spanish origin Other        -2.05224    1.28403
## factor(Ethnicity)Mexican, Mexican American, Chicano/a -3.06385    1.28968
## factor(Ethnicity)Not Hispanic or Latino               -2.26967    1.26945
## factor(Ethnicity)Puerto Rican                         -3.50846    1.55455
## factor(Ethnicity)Unknown                              -2.07179    1.27271
## factor(Race)Asian                                     -0.45157    0.38040
## factor(Race)Black or African American                 -1.13579    0.38807
## factor(Race)Multiple Races                            -0.60796    0.46563
## factor(Race)NA                                        -1.36555    0.54342
## factor(Race)Native Hawaiian or Other Pacific Islander  0.17267    0.56082
## factor(Race)Other                                     -0.42089    0.37814
## factor(Race)Unknown                                   -0.45472    0.38690
## factor(Race)White or Caucasian                        -0.62499    0.37577
## factor(Gender)Male                                     0.18653    0.04308
## factor(Gender)Unknown                                  2.40128    2.53931
##                                                       t value Pr(>|t|)    
## (Intercept)                                            12.452  < 2e-16 ***
## factor(Ethnicity)Hispanic or Latino                    -1.936  0.05287 .  
## factor(Ethnicity)Hispanic/Spanish origin Other         -1.598  0.11000    
## factor(Ethnicity)Mexican, Mexican American, Chicano/a  -2.376  0.01753 *  
## factor(Ethnicity)Not Hispanic or Latino                -1.788  0.07381 .  
## factor(Ethnicity)Puerto Rican                          -2.257  0.02403 *  
## factor(Ethnicity)Unknown                               -1.628  0.10358    
## factor(Race)Asian                                      -1.187  0.23521    
## factor(Race)Black or African American                  -2.927  0.00343 ** 
## factor(Race)Multiple Races                             -1.306  0.19168    
## factor(Race)NA                                         -2.513  0.01199 *  
## factor(Race)Native Hawaiian or Other Pacific Islander   0.308  0.75818    
## factor(Race)Other                                      -1.113  0.26570    
## factor(Race)Unknown                                    -1.175  0.23991    
## factor(Race)White or Caucasian                         -1.663  0.09629 .  
## factor(Gender)Male                                      4.330  1.5e-05 ***
## factor(Gender)Unknown                                   0.946  0.34435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 6.442289)
## 
##     Null deviance: 92020  on 14160  degrees of freedom
## Residual deviance: 91120  on 14144  degrees of freedom
## AIC: 66586
## 
## Number of Fisher Scoring iterations: 2
# Sodium modelled on anion gap alone (default gaussian family).
model3 <- glm(
  formula = SODIUM ~ AnionGap,
  data = datajoinnew
)
summary(model3)
## 
## Call:
## glm(formula = SODIUM ~ AnionGap, data = datajoinnew)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -16.9951   -1.4079    0.0049    1.5921   10.3964  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 137.45071    0.10905 1260.44   <2e-16 ***
## AnionGap      0.19572    0.00779   25.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.583425)
## 
##     Null deviance: 82581  on 14160  degrees of freedom
## Residual deviance: 79056  on 14159  degrees of freedom
## AIC: 64545
## 
## Number of Fisher Scoring iterations: 2

I think there is very little association between Sodium and either gender (male) or race (Asian), and little association between AnionGap and either gender (male) or race (Black). These associations are very small. There is an association between Sodium and AnionGap. The values go up with year as well.