First, we load necessary R libraries.
library(data.table)
library(ggpubr)
library(ggbeeswarm)
library(rstatix)
library(janitor)
library(ggforce)
library(openxlsx)
library(dplyr)
options(stringsAsFactors = F,max.print = 10000,error=NULL,
"openxlsx.dateFormat" = "dd/mm/yyyy"
)
Then, we load the data source necessary to generate all the figures.
sheet.names <- getSheetNames("../../Revision/Results/CLL_fCpGs_Gene_expression_bulk/Data_source_gene_expression.xlsx")
sheet.names <- structure(sheet.names,names=sheet.names)
sheet.names
data.source <- lapply(sheet.names, function(sheet.i){
res <- read.xlsx(xlsxFile = "../../Revision/Results/CLL_fCpGs_Gene_expression_bulk/Data_source_gene_expression.xlsx",
sheet = sheet.i,detectDates = T)
})
lapply(data.source,head)
We can see how genes associated with fCpGs show a lower gene expression than those non-fCpGs genes.(The Illumina array annotation was used to link CpGs to genes).
df.pp.plot <- data.source$pp_plot
pp.plot <-
df.pp.plot %>%
ggplot(aes(x=Non.fCpG.genes,
y=fCpG.genes
)
)+
xlab("Non-fCpG genes")+
ylab("fCpG genes")+
geom_abline(slope = 1,
color="grey80",
lty="dashed",
lwd=0.3
)+
ggrastr::rasterise(geom_point(size=1.5),dpi=300)+
# ggtitle(paste0("pp plot (%) of tpms, samples:",length(unique(df.tpms$sample))))+
theme_bw()+
theme(text=element_text(size = 5),
line = element_line(linewidth = 0.3),
panel.background = element_rect(fill = NA,colour = NA,linewidth = 0.3)
)
print(pp.plot)
We selected a random patient and plot the gene expression of genes associated with fCpGs and those that are not associated with them. (The Illumina array annotation was used to link CpGs to genes).
random.sample.anonymous <- data.source$sample_genes_fCpGs_non.fCpgs$sample
plot.genes.sample <-
data.source$sample_genes_fCpGs_non.fCpgs %>%
ggplot(aes(x = fCpG,
y = tpm +1
)
)+
geom_boxplot(
aes(fill=fCpG),#disease_state,sample_tx
show.legend = F,
lwd=0.3
)+
geom_pwc(method = "wilcox_test",
size = 0.3,
p.adjust.method = "none",
label = "P={p}",
y.position = log10(8500),
label.size = 1
)+
scale_fill_manual(values = c("orange2","grey70"))+
coord_cartesian(ylim = c(0,max(data.source$sample_genes_fCpGs_non.fCpgs$tpm))+1)+
scale_y_log10()+
annotation_logticks(sides = "l",
size = 0.3)+
ggtitle(paste0("Anonymous=",random.sample.anonymous))+
theme_bw()+
theme(legend.position = "none",
plot.margin = unit(c(0,0,0,0),"pt"),
text = element_text(size = 6),
line = element_line(linewidth = 0.3),
panel.border = element_rect(fill = NA,linewidth = 0.3),
panel.background = element_blank()
)
plot.genes.sample
All genes related to fCpGs are lowly expressed regardless of the number of alleles methylated. A beta binomial mixture model was used to categorize CpGs, and then Illumina array annotation was used to link CpGs to genes.
## plot genes segregated by meth allele of the random sample
plot.meth.alleles.sample <-
data.source$sample_meth_alleles_expression %>%
ggplot(aes(x = meth,
y = tpm+1
)
)+
geom_beeswarm(aes(fill=meth),
color="grey10",
corral = "wrap",
pch=21,
stroke=0.1,
size=1
)+
geom_boxplot(aes(fill=meth),
alpha=0.2,
outlier.shape = NA,
lwd=0.3
)+
scale_fill_manual(values = c("0"="#2F7FC2","2"="#BC5250","1"="grey80"))+
geom_pwc(method = "wilcox_test",
label = "P={p}",
size=0.3,
label.size = 1,
step.increase = 0.05
)+
coord_cartesian(ylim = c(0,max(data.source$sample_genes_fCpGs_non.fCpgs$tpm))+1)+
scale_y_log10()+
annotation_logticks(sides = "l",
size=0.3)+
ylab(NULL)+
xlab("# of fCpG methylated alleles")+
theme_bw()+
theme(legend.position = "none",
plot.margin = unit(c(0,0,0,0),"pt"),
text = element_text(size = 6),
line = element_line(linewidth = 0.3),
panel.border = element_rect(fill = NA,linewidth = 0.3),
panel.background = element_blank()
)
print(plot.meth.alleles.sample)
Plot all panels together
cowplot::plot_grid(pp.plot,
plot.genes.sample,
plot.meth.alleles.sample,
nrow = 1,
align = "h",
rel_widths = c(1,0.5,0.75)
)
print(sessionInfo())
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Madrid
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.1.4 openxlsx_4.2.8 ggforce_0.5.0 janitor_2.2.1
## [5] rstatix_0.7.2.999 ggbeeswarm_0.7.2 ggpubr_0.6.1 ggplot2_3.5.2
## [9] data.table_1.17.8 BiocStyle_2.36.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 beeswarm_0.4.0 xfun_0.53
## [4] bslib_0.9.0 Cairo_1.6-5 vctrs_0.6.5
## [7] tools_4.5.0 generics_0.1.4 tibble_3.3.0
## [10] pkgconfig_2.0.3 RColorBrewer_1.1-3 lifecycle_1.0.4
## [13] compiler_4.5.0 farver_2.1.2 stringr_1.5.1
## [16] tinytex_0.57 carData_3.0-5 snakecase_0.11.1
## [19] vipor_0.4.7 htmltools_0.5.8.1 sass_0.4.10
## [22] yaml_2.3.10 Formula_1.2-5 pillar_1.11.0
## [25] car_3.1-3 jquerylib_0.1.4 tidyr_1.3.1
## [28] MASS_7.3-65 cachem_1.1.0 magick_2.8.7
## [31] abind_1.4-8 tidyselect_1.2.1 zip_2.3.3
## [34] digest_0.6.37 stringi_1.8.7 purrr_1.1.0
## [37] bookdown_0.44 labeling_0.4.3 cowplot_1.2.0
## [40] polyclip_1.10-7 fastmap_1.2.0 grid_4.5.0
## [43] cli_3.6.5 magrittr_2.0.3 dichromat_2.0-0.1
## [46] broom_1.0.9 withr_3.0.2 scales_1.4.0
## [49] backports_1.5.0 lubridate_1.9.4 timechange_0.3.0
## [52] rmarkdown_2.29 ggsignif_0.6.4 evaluate_1.0.5
## [55] knitr_1.50 ggrastr_1.0.2 rlang_1.1.6
## [58] Rcpp_1.1.0 glue_1.8.0 tweenr_2.0.3
## [61] BiocManager_1.30.26 rstudioapi_0.17.1 jsonlite_2.0.0
## [64] R6_2.6.1