library(scales); library(dplyr); library(gridExtra);
library(xtable); library(magrittr); library(knitr)
library(vcd)
$set(echo = TRUE, warning=FALSE, message=FALSE, cache=TRUE) opts_chunk
<- function(df, var, dig){
univ.n.p %>% select_(var) %>% na.omit() %>% group_by_(var) %>%
df summarise(n=n()) %>%
mutate(pct=round(n/sum(n),dig))
}
<- function(df, var){
univ.n.pct %>% select_(var) %>% na.omit() %>% group_by_(var) %>%
df summarise(n=n()) %>%
mutate(pct=paste0(round(n/sum(n)*100,1), '%'))
}
<- function(df, row.var, col.var) {
biv.n.col.p %>% group_by_(row.var, col.var) %>%
df summarise(count=n()) %>%
mutate(pct=round(count/sum(count),3))
}
<- "data6e/"
data.path
<- read.delim(paste0(data.path,"Parhiv.txt"), sep="\t", header=TRUE)
raw
<- raw %>%
parhiv mutate(JOBMO = factor(JOBMO,
labels=c("Employed", "Unemployed", "Retired/Disabled")),
EDUMO = factor(EDUMO,
labels=c("Less than High School", "High School Graduate/GED",
"Post Secondary")),
bsi_overall = rowMeans(raw[,grep("BSI", names(raw))]),
log_bsi_overall = log(bsi_overall+.01),
parent_bonding = (PB01+(4-PB02)+(4-PB04)+PB05+PB06+
+PB12+(4-PB14)+(4-PB16)+PB17+(4-PB18)+(4-PB24))/12)
PB11$AGEALC[parhiv$AGEALC==0] <- NA
parhiv$AGEMAR[parhiv$AGEMAR==0] <- NA
parhiv$AGESMOKE[parhiv$AGESMOKE==0] <- NA
parhiv
<- read.delim(paste0(data.path,"Depress.txt"), sep="\t", header=TRUE)
raw.d <- raw.d %>%
depress mutate(gender = factor(SEX,
labels=c("Male", "Female")),
educat = factor(EDUCAT,
labels=c("<HS", "Some HS", "HS Grad", "Some college",
"BS", "MS", "PhD")),
marital = factor(MARITAL,
labels = c("Never Married", "Married", "Divorced",
"Separated", "Widowed")))
<- read.delim(paste0(data.path,"Lung.txt"), sep="\t", header=TRUE)
raw.l
<- read.delim(paste0(data.path,"Mice.txt"), sep="\t", header=TRUE) mice
hist(parhiv$bsi_overall);box()
ggplot(parhiv, aes(x=bsi_overall)) +
geom_histogram() +
theme_bw(base_size=16)
ggplot(parhiv, aes(x=bsi_overall)) +
geom_histogram() +
theme_classic(base_size = 20) +
xlab("BSI score") + ylab("Frequency") + ggtitle("Brief Symptom Inventory Score")
ggplot(filter(parhiv, !is.na(EDUMO)), aes(x=EDUMO)) +
theme_classic(base_size = 20) +
geom_bar(aes(y = ..count..)) + ggtitle("Frequency of educational level") +
xlab("Highest educational level attained") +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 12, simplify = FALSE),
collapse="\n")) +
paste, geom_text(aes(y=..count.. + 5, label=..count..), stat='count', size = 7) +
theme(axis.ticks.x=element_blank())
<- univ.n.p(parhiv, "EDUMO", dig=3)
ptab.ctp
ggplot(ptab.ctp, aes(x=EDUMO)) +
geom_bar(aes(y=pct), stat="identity") +
xlab("Highest educational level attained") + ylab("") +
ggtitle("Percent of mothers with each \n educational level") +
theme_classic(base_size = 20) +
scale_y_continuous(labels=percent, limits=c(0,1)) + ylab("") +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 12, simplify = FALSE),
collapse="\n")) +
paste, geom_text(aes(y=pct + .05, label=paste0(pct*100, "%")), size = 7) +
theme(axis.ticks.x=element_blank())
<- univ.n.pct(depress, "marital")
ptab.empl
ggplot(ptab.empl, aes(x=n, y=reorder(marital, n), label=n)) +
geom_point(size = 3) +
geom_text(size=5, vjust=-1) +
xlab("Frequency") + ylab("Marital Status") + ggtitle("Frequency of marital status") +
theme_classic(base_size=20) +
scale_x_continuous(limits=c(0,130))+
theme(panel.grid.major.x = element_blank(), axis.ticks.y=element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color='grey60', linetype='dashed'))
<- table(depress$marital)
dc
pie(dc, labels = paste0(names(dc), ' (', round(prop.table(dc)*100,1), "%)"),
col = c("#f7f7f7","#cccccc", "#969696", "#636363", "#252525"), cex=1.5, radius=0.5)
par(mar=c(0,0,0,0), oma=c(0,0,0,0))
plot(c(0,0.5), c(0,1), type="n", axes=FALSE, main="", ylab="", xlab="")
<- capture.output(stem(sort(depress$AGE)))
tmp <- tmp[4:length(tmp)]
tmp text(0,1, paste(tmp, collapse='\n'), adj=c(0,1), family='mono', cex=1)
ggplot(depress, aes(y=AGE, x=1)) +
geom_jitter(position = position_jitter(0.2)) +
ylab("Age") +
scale_x_continuous(limits=c(0,2)) +
theme_classic(base_size=20)+ xlab("")+
theme(panel.grid.major.y = element_blank(), axis.ticks.y=element_blank(),
panel.grid.minor.y= element_blank(), axis.text.y=element_blank()) +
coord_flip()
ggplot(depress, aes(x=AGE)) +
geom_histogram(colour="black", fill="grey80", binwidth=10) +
xlab("Age") + ylab("Frequency") +
ggtitle("Distribution of age") +
theme_classic(base_size = 16) +
scale_x_continuous(limits=c(20,80), breaks=seq(20,80,by=10))
ggplot(depress, aes(x=AGE)) +
geom_histogram(colour="black", fill="grey80") +
xlab("Age") + ylab("Frequency") +
ggtitle("Distribution of age") +
theme_classic(base_size = 16) +
scale_x_continuous(limits=c(15,90), breaks=seq(20,90,by=10))
ggplot(depress, aes(x=AGE)) +
geom_histogram(aes(y=..density..), colour="black", fill="grey80") +
xlab("Age") + ylab("Density") +
geom_density(colour="black", lwd=1.1) +
scale_x_continuous(limits=c(15,90), breaks=seq(20,90,by=10)) +
ggtitle("Distribution of age") + theme_classic(base_size = 16)
<- boxplot(depress$AGE, plot=FALSE)$stats
txt <- data.frame(value=as.vector(txt), lab=as.character(txt))
data.labels <- data.frame(value=as.vector(txt),
labels lab=c("Min", "Q(0.25)", "Median", "Q(0.75)", "Max"))
ggplot(depress, aes(y=AGE, x=1)) +
geom_boxplot(width=.3) + ylab("Age") + xlab("") +
scale_x_continuous(limits=c(0.5,1.35)) +
theme_classic(base_size = 16) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
geom_text(data=data.labels,aes(x=0.75,y=value,label=lab),angle=0, size=5) +
geom_text(data=labels,aes(x=0.6,y=value,label=lab),angle=0, size=5)
<- boxplot(depress$CESD, plot=FALSE)$stats
txt <- data.frame(value=as.vector(txt), lab=as.character(txt))
data.labels <- data.frame(value=as.vector(txt),
labels lab=c("Min", "Q(0.25)", "Median", "Q(0.75)", "Largest value \n within fence"))
ggplot(depress, aes(y=CESD, x=1)) +
geom_boxplot(width=.3) + ylab("CESD") + xlab("") +
scale_x_continuous(limits=c(0.75,1.5)) +
theme_classic(base_size = 16) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
geom_text(data=data.labels,aes(x=1.25,y=value,label=lab),angle=0, size=5) +
geom_text(data=labels,aes(x=1.4,y=value,label=lab),angle=0, size=5)
ggplot(depress, aes(y=AGE, x=1)) +
geom_boxplot(width=.3) + geom_violin(alpha=.2) +
ylab("Age") + xlab("") + theme_classic(base_size = 16) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
stat_summary(fun.y=mean, geom="point", shape=18, size=4)
ggplot(depress, aes(y=CESD, x=1)) +
geom_boxplot(width=.3) + geom_violin(alpha=.2) +
ylab("CESD") + xlab("") + theme_classic(base_size = 16) +
geom_jitter(alpha=.3, width=.1) +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
table(depress$gender, depress$educat) %>% kable()
<HS | Some HS | HS Grad | Some college | BS | MS | PhD | |
---|---|---|---|---|---|---|---|
Male | 4 | 19 | 39 | 18 | 17 | 8 | 6 |
Female | 1 | 42 | 75 | 30 | 26 | 6 | 3 |
kable(
addmargins(
prop.table(table(depress$gender, depress$educat))*100
),digits=1)
<HS | Some HS | HS Grad | Some college | BS | MS | PhD | Sum | |
---|---|---|---|---|---|---|---|---|
Male | 1.4 | 6.5 | 13.3 | 6.1 | 5.8 | 2.7 | 2.0 | 37.8 |
Female | 0.3 | 14.3 | 25.5 | 10.2 | 8.8 | 2.0 | 1.0 | 62.2 |
Sum | 1.7 | 20.7 | 38.8 | 16.3 | 14.6 | 4.8 | 3.1 | 100.0 |
kable(
addmargins(
prop.table(table(depress$gender, depress$educat), margin=1)*100
),digits=1)
<HS | Some HS | HS Grad | Some college | BS | MS | PhD | Sum | |
---|---|---|---|---|---|---|---|---|
Male | 3.6 | 17.1 | 35.1 | 16.2 | 15.3 | 7.2 | 5.4 | 100 |
Female | 0.5 | 23.0 | 41.0 | 16.4 | 14.2 | 3.3 | 1.6 | 100 |
Sum | 4.2 | 40.1 | 76.1 | 32.6 | 29.5 | 10.5 | 7.0 | 200 |
kable(
addmargins(
prop.table(table(depress$gender, depress$educat), margin=2)*100
),digits=1)
<HS | Some HS | HS Grad | Some college | BS | MS | PhD | Sum | |
---|---|---|---|---|---|---|---|---|
Male | 80 | 31.1 | 34.2 | 37.5 | 39.5 | 57.1 | 66.7 | 346.2 |
Female | 20 | 68.9 | 65.8 | 62.5 | 60.5 | 42.9 | 33.3 | 353.8 |
Sum | 100 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | 700.0 |
<- parhiv %>% select(JOBMO, EDUMO) %>% na.omit()
phna <- biv.n.col.p(phna, "EDUMO", "JOBMO")
pct.cc
ggplot(phna, aes(x=EDUMO, fill=JOBMO)) +
geom_bar() +
theme_classic(base_size = 16) +
scale_fill_grey(name="Job Status") + xlab("") +
theme(legend.position="top", legend.title=element_text(size=10), legend.text=element_text(size=10)) +
coord_equal(1/30) + ylab("Frequency") +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 12, simplify = FALSE),
collapse="\n")) +
paste, theme(axis.ticks.x=element_blank())
ggplot(pct.cc, aes(x=EDUMO, fill=JOBMO)) +
geom_bar(aes(y=pct), stat='identity') +
scale_fill_grey(guide=FALSE) +
theme_classic(base_size = 16) +
xlab("") + ylab("Percent") +
scale_y_continuous(labels=percent) +
coord_equal(1/0.35) +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 12, simplify = FALSE),
collapse="\n")) +
paste, theme(axis.ticks.x=element_blank())
ggplot(pct.cc, aes(x=EDUMO, y=pct, fill=JOBMO)) +
scale_fill_grey(name="Job Status") +
geom_bar(stat="identity", position = "dodge") +
theme_classic(base_size = 16) + xlab("") +
scale_y_continuous(labels = percent) +
ylab("Percent") +
scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 12, simplify = FALSE),
collapse="\n")) +
paste, theme(axis.ticks.x=element_blank())
ggplot(pct.cc, aes(x=count, y=reorder(JOBMO, count))) +
geom_point(size = 3) +
xlab("Frequency") +
facet_grid(EDUMO ~ ., labeller=label_wrap_gen(14)) +
geom_segment(aes(yend=JOBMO), xend=0, color='grey50') +
ylab("Job Status") + theme_bw(base_size = 16) +
theme(panel.grid.major.y = element_blank()) +
theme(axis.ticks.y=element_blank())
<- parhiv %>% mutate(gender=factor(GENDER, labels=c("Male", "Female"))) %>%
bdpd select(JOBMO, gender) %>% na.omit() %>%
group_by(JOBMO, gender) %>%
summarise(count=n())
ggplot(bdpd, aes(x=count, y=reorder(JOBMO, count))) +
geom_point(aes(shape=gender), size=3) +
geom_line(aes(group=JOBMO)) +
scale_shape_discrete(name="Gender of \n teen") +
xlab("Frequency") +
ylab("Mother's Job Status") +
theme_bw(base_size = 16) +
theme(panel.grid.major.y = element_blank()) +
theme(axis.ticks.y=element_blank())
kable(
prop.table(
table(phna$EDUMO, phna$JOBMO)
*100,
)digits=1)
Employed | Unemployed | Retired/Disabled | |
---|---|---|---|
Less than High School | 4.0 | 24.7 | 13.2 |
High School Graduate/GED | 4.0 | 14.4 | 13.8 |
Post Secondary | 5.2 | 10.9 | 9.8 |
<- list(set_varnames=c(JOBMO="Job Status", EDUMO="Highest Education Level"),
varnames rot_labels = c(0,90,0,0),
offset_varnames = c(left = 5, top=1),
just_labels=c(left="right"))
<- list(EDUMO = c("Less than\nHigh School", "High School\nGraduate/GED", "Post\nSecondary"))
lnames
mosaic(JOBMO~EDUMO, data=phna, color=TRUE,
set_labels=lnames,
labeling_args = varnames,
margins=c(top=5, right=0, left=7))
ggplot(depress, aes(x=AGE, y=CESD)) + geom_point(size=1, col="grey") + theme_classic(base_size = 16) +
xlab("Age") + ylab("CESD") +
geom_smooth(se=FALSE, method="lm", color="black", aes(linetype="Linear"))+
geom_smooth(se=FALSE, method="loess", color="black", aes(linetype="Lowess"))+
scale_linetype_manual(values = c("solid","dashed"), name="line types",
labels=c("Linear","Lowess"),guide="legend")
ggplot(mice, aes(y=WEIGHT, x=DAY, group=ID)) +
geom_point() + geom_line() +
theme_classic(base_size = 16) +
ylab("Weight (g)") + xlab("Time (Days)")
ggplot(depress, aes(y=CESD, x=gender, color=gender)) +
scale_color_grey(start=0, end=0.5,guide=FALSE) +
geom_boxplot(width=.3) + geom_violin(alpha=.2) +
ylab("CESD") + xlab("") +
theme_classic(base_size = 20) +
stat_summary(fun.y=mean, geom="point", shape=18, size=4) +
theme(axis.ticks.x=element_blank())
ggplot(depress, aes(x=CESD, color=gender)) +
geom_density() + theme_classic(base_size = 20) +
ylab("Density") + xlab("CESD")+
scale_color_grey(start=0, end=0.5)+
theme(axis.ticks = element_blank(), axis.text.y = element_blank()) +
theme(legend.justification=c(1,1), legend.position=c(1,1))
ggplot(depress, aes(x=CESD, fill=gender)) +
facet_grid(gender ~ .) +
xlab("CESD")+ylab("Frequency") +
geom_histogram() + theme_bw(base_size = 20) +
scale_fill_grey(start=0, end=0.5,guide=FALSE) +
theme(axis.ticks = element_blank(), axis.text.y = element_blank()) +
theme(legend.justification=c(1,1), legend.position=c(1,1))
ggplot(depress, aes(x=AGE, y=CESD, color=gender)) +
scale_color_grey() +
geom_point() +
theme_classic(base_size = 20) +
xlab("Age") + ylab("CESD")
ggplot(depress, aes(x=AGE, y=CESD, shape=gender)) +
geom_point() +
theme_classic(base_size = 20) +
xlab("Age") + ylab("CESD")
ggplot(depress, aes(x=AGE, y=CESD, size=INCOME)) +
geom_point(shape=21) +
theme_classic(base_size = 20) +
xlab("Age") + ylab("CESD") +
scale_size_continuous(limits=c(0,70), breaks = seq(0,70,by=10))
ggplot(depress, aes(x=AGE, y=CESD, fill=INCOME)) +
geom_point(shape=21, size=5) +
theme_classic(base_size = 20) +
xlab("Age") + ylab("CESD") +
scale_fill_gradient(low="white", high="black")
ggplot(filter(parhiv, !is.na(EDUMO) & !is.na(JOBMO)), aes(x=bsi_overall)) +
ylab("Frequency") +
geom_histogram()+
facet_grid(EDUMO~JOBMO, labeller=label_wrap_gen(14)) +
scale_y_continuous(breaks=seq(0,10,by=2))+
theme_bw(base_size = 18) +
xlab("Overall BSI score")
library(psych)
<- parhiv[,c("bsi_overall", "parent_bonding", 'AGESMOKE', 'AGEALC')]
c.vars names(c.vars) <- c("BSI", "Parental care subscale", "Age started smoking", "Age started drinking")
pairs.panels(c.vars, hist.col="grey80", col=1, rug=FALSE, cex.cor = .5, ellipses = FALSE)
library(corrplot)
corrplot(cor(c.vars, use='pairwise.complete.obs'), type="upper", col=gray.colors(100),
tl.col="black", tl.srt=30)