Here we use previous WGBS data from normal and neoplastic B cells, and map the identified fCpGs in array data and check the methylation status in normal and neoplastic B cells.
##
## Load necessary packages and set global options
##
library(data.table)
library(janitor)
library(dplyr)
library(ggpubr)
library(pals)
library(GenomicRanges)
library(openxlsx)
library(BSgenome.Hsapiens.UCSC.hg38)
options(stringsAsFactors = F,max.print = 10000,error=NULL,
"openxlsx.dateFormat" = "dd/mm/yyyy"
)
## load fCpGs with meth values
fcpgs.betas <-
fread("../Data/FMC/Final_data/beta_filtered_samples_fcpgs_laplacian.tsv",data.table = F) %>%
tibble::column_to_rownames(var = "V1")
fcpgs.betas[1:5,1:5]
dim(fcpgs.betas)
fcpgs <-
openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
sheet = "supplementary_table_3") %>%
clean_names()
fcpgs <-
GRanges(fcpgs$chromosome,IRanges(fcpgs$position,fcpgs$position),strand = fcpgs$strand,
fcpgs %>% select(-c(chromosome,position,strand))
)
fcpgs
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="-"]) %>% as.character() %>% table
##perform liftover
hg19.to.hg38.chain <- rtracklayer::import.chain(con = "../Revision/Data/Nanopore/hg19ToHg38.over.chain")
fcpgs.hg38 <- rtracklayer::liftOver(x = fcpgs,chain = hg19.to.hg38.chain)
fcpgs.hg38 <- unlist(fcpgs.hg38)
length(fcpgs)
length(fcpgs.hg38)
fcpgs$name[which(!fcpgs$name %in% fcpgs.hg38$name)]
fcpgs.hg38
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
fcpgs.hg38[strand(fcpgs.hg38)=="-"] %>% granges()
##liftover do not work on this CpG
fcpgs.hg38[strand(fcpgs.hg38)=="-"][which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G"),]
exclude <- fcpgs.hg38[strand(fcpgs.hg38)=="-"]$name[which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G")]
fcpgs.hg38 <- fcpgs.hg38[fcpgs.hg38$name!=exclude]
fcpgs.hg38
#check nucleotides
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
# ok!
## meth fcpgs
# update annotation for - stand, as array annot of CG always refer to + strand and first nucleotide
# fcpgs.hg38[which(strand(fcpgs.hg38)=="-")] <-
# resize(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")],width = 2,fix = "end")
# fcpgs.hg38
# start(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")]) <-
# end(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")])
# fcpgs.hg38 %>% width() %>% table
# fcpgs.hg38
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% unique()
##ok"
fcpgs.hg38 <- sort(sortSeqlevels(fcpgs.hg38),ignore.strand=T)
fcpgs.hg38
##
## Load previous WGBS
##
load("../../WGBS/WGBS_57_10reads_NAs.RData")
head(meth.final);dim(meth.final)
class(meth.final)
colnames(meth.final) <- c("chr","start",
"S1.1","S1.2","S3.1","S3.2","NBCB8","NBCB7","NBCB9","GCBC3","GCBC5","GCBC2","ncsMBC3","csMBC5","csMBC6","PCBM1","PCBM2","PCT2","PCT6","PCT5",
"ALL.M485","ALL.M486",
"MCL8761","MCL2392","MCL81703","MCL6902","MCL007",
"CLL3","CLL182","CLL110","CLL12","CLL1525","CLL1532","CLL1228","CLL16",
"FL","FL","FL","BL","FL","BL","FL","BL","BL","UN","BL","FL","BL","FL","BL","BL","BL","BL","BL",
"MM54847","MM55358","MM54498","MM35574","MM35174"
) %>% make.unique()
wgbs <- GRanges(seqnames = meth.final$chr,ranges = IRanges(meth.final$start,meth.final$start))
mcols(wgbs) <- meth.final %>% select(-c(chr,start))
wgbs <- sort(sortSeqlevels(wgbs),ignore.strand=T)
wgbs
rm(meth.final);gc()
## checks
# wgbs.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,wgbs)
# wgbs.seq
# wgbs.seq %>% as.matrix() %>% apply(.,2,table)
We next overlap fCpGs with WGBS data.
##overlap with fcpgs
fcpgs.overlaps <- findOverlaps(query = fcpgs.hg38,
subject = wgbs,
ignore.strand=T
)
fcpgs.overlaps
subjectHits(fcpgs.overlaps) %>% length()
subjectHits(fcpgs.overlaps) %>% unique() %>% length()
fcpgs.meth <- wgbs[subjectHits(fcpgs.overlaps)]
fcpgs.meth %>% width() %>% table
fcpgs.meth
##names regions with cpg names
queryHits(fcpgs.overlaps) %>% length()
queryHits(fcpgs.overlaps) %>% unique() %>% length()
names(fcpgs.meth) <- fcpgs.hg38$name[queryHits(fcpgs.overlaps)]
identical(fcpgs.meth,sort(sortSeqlevels(fcpgs.meth),ignore.strand=TRUE))
fcpgs.meth
getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.meth) %>% as.character() %>% table() ## al Cs! OK!
We next remove some samples with lower CpG coverages and plot the heatmap.
##
## Plot
##
##heamtap
library(ComplexHeatmap)
library(circlize)
## get colors for heatmap
cols.paper <- c(
NA_values="#666666",
HPC="#E6E6E6",
pre.Bcell="#D5D5D5",
NBC="#C3C3C3",
GCB="#AEAEAE",
tPC="#969696",
MBC="#787878",
bmPC="#4D4D4D",
Tcell="#F0F8FF",
PBMCs="#FFF8DC",
WholeBlood="#CDC8B1",
Normal_lymphoid_cells="#D5D5D5",
Normal_myeloid_cells="#E6E6FA",
TALL="#CD3333",
TALL.remission="#C7A5A5",
BALL="#E69F00",
BALL.remission="#D8CAAB",
MCL="#F0E442",
C1.or.cMCL="#F5D51B",
C2.or.nnMCL="#0072B2",
DLBCL.NOS="#56B4E9",
MBL="#5A9A89",
CLL="#009E73",
RT="#02C590",
CLL.unmutated="#e65100",
CLL.mutated="#361379",
nCLL="#006E93",
iCLL="#FDC010",
mCLL="#963736",
Disease.Unclassified="#CCCCCC",
MGUS="#BF99AE",
MM="#CC79A7"
)
evoflux.meth <- fread("../Revision/Data/meth_palette_evoflux.tsv")
evoflux.meth
## get matrix
mat.meth <- mcols(fcpgs.meth) %>% as.matrix()
##reatain cell types analyzed in the sutdy
mat.meth <- mat.meth[,grep("^FL|BL|UN",colnames(mat.meth),invert = T)]
head(mat.meth);dim(mat.meth)
( colSums(is.na(mat.meth))/nrow(mat.meth) ) %>% hist(.,main="CpG missigness across across samples")
exclude <- sort(colSums(is.na(mat.meth))/nrow(mat.meth))[ncol(mat.meth)] %>% names()
##remove 1 MM with many NA's
mat.meth <- mat.meth[ ,!colnames(mat.meth) %in% exclude]
head(mat.meth);dim(mat.meth)
##check CpGs across samples
( rowSums(is.na(mat.meth))/ncol(mat.meth) ) %>% hist(.,main="Missigness across CpGs")
dim(mat.meth)
mat.meth <- mat.meth[ (rowSums(is.na(mat.meth))/ncol(mat.meth) ) <= 0.25,]
dim(mat.meth)
##set column annotation
colnames(mat.meth)
samples.annotation <- setNames(colnames(mat.meth),NA)
names(samples.annotation)[grep("^ALL|^MCL|^CLL|^MM",colnames(mat.meth),invert = T)] <- "Normal B cell"
names(samples.annotation)[grep("^ALL",colnames(mat.meth),invert = F)] <- "BALL"
names(samples.annotation)[grep("^MCL",colnames(mat.meth),invert = F)] <- "MCL"
names(samples.annotation)[grep("^CLL",colnames(mat.meth),invert = F)] <- "CLL"
names(samples.annotation)[grep("^MM",colnames(mat.meth),invert = F)] <- "MM"
samples.annotation
ht_opt$legend_grid_width <- unit(2,"mm")
ht_opt$legend_grid_height <-unit(2,"mm")
ht_opt$legend_labels_gp <- gpar(fontsize=5)
ht_opt$legend_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_names_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_names_gp <- gpar(fontsize=5)
ht_opt$HEATMAP_LEGEND_PADDING <- unit(1,"mm")
ht_opt$ANNOTATION_LEGEND_PADDING <- unit(1,"mm")
ht_opt$legend_gap <- unit(c(1,1),"mm")
ht_opt$fast_hclust <- T
h <- Heatmap(matrix = mat.meth,
row_title=paste0("CpGs=",nrow(mat.meth)," (at least 10 reads in each sample)"),
cluster_columns = T,
cluster_rows = T,
show_row_names = F,
show_column_names = F,
show_row_dend = F,
show_column_dend = F,
name="DNA methylation",
border = T,
col=evoflux.meth$color,
column_split = ifelse(names(samples.annotation)=="Normal B cell","Normal B cells","B-cell tumors"),
cluster_column_slices = F,
row_dend_gp = gpar(lwd=0.01),
column_dend_gp = gpar(lwd=0.3),
na_col = "grey0",
top_annotation = HeatmapAnnotation("Cell type"=names(samples.annotation),
col=list("Cell type"=c(cols.paper,"Normal B cell"="#D5D5D5")
),
show_annotation_name = F,
na_col = "grey45",
annotation_name_side = "left",
show_legend = F,
annotation_name_gp = gpar(fontsize=5),
simple_anno_size = unit(2,"mm")
),
heatmap_legend_param = list(break_dist = 1,at=seq(0,1,by=0.2),direction="horizontal"),
)
draw(h,
column_title = "WGBS",
ht_gap = unit(2,"mm"),
merge_legend=T,
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom"
)
We next perform histogramsn to see the distribution of CpG methylation.
for (i in colnames(mat.meth)){
mat.meth.hist <-
mat.meth %>%
reshape2::melt() %>%
filter(Var2==i) %>%
tidyr::drop_na() %>%
mutate(N.CpGs=n(),
sample_type=names(samples.annotation)[samples.annotation==unique(Var2)]
)
p <- mat.meth.hist %>%
ggplot(aes(x=value,
fill=sample_type
))+
geom_histogram(color="grey20",lwd=0.1)+
scale_fill_manual(values = c(cols.paper,"Normal B cell"="#D5D5D5"))+
xlab("DNA methylation")+
theme_classic()+
ggtitle(paste0(mat.meth.hist$N.CpGs %>% unique()," CpGs with at least 10 reads (sample: ",i,")"))+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text.y = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.box.margin=margin(-5,-5,-5,-5),
legend.position = "bottom",
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
print(sessionInfo())
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Madrid
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] circlize_0.4.16 ComplexHeatmap_2.24.1
## [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.76.0
## [5] rtracklayer_1.68.0 BiocIO_1.18.0
## [7] Biostrings_2.76.0 XVector_0.48.0
## [9] openxlsx_4.2.8 GenomicRanges_1.60.0
## [11] GenomeInfoDb_1.44.2 IRanges_2.42.0
## [13] S4Vectors_0.46.0 BiocGenerics_0.54.0
## [15] generics_0.1.4 pals_1.10
## [17] ggpubr_0.6.1 ggplot2_3.5.2
## [19] dplyr_1.1.4 janitor_2.2.1
## [21] data.table_1.17.8 BiocStyle_2.36.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 rlang_1.1.6
## [3] magrittr_2.0.3 clue_0.3-66
## [5] GetoptLong_1.0.5 snakecase_0.11.1
## [7] matrixStats_1.5.0 compiler_4.5.0
## [9] reshape2_1.4.4 png_0.1-8
## [11] vctrs_0.6.5 maps_3.4.3
## [13] stringr_1.5.1 shape_1.4.6.1
## [15] pkgconfig_2.0.3 crayon_1.5.3
## [17] fastmap_1.2.0 magick_2.8.7
## [19] backports_1.5.0 labeling_0.4.3
## [21] Rsamtools_2.24.0 rmarkdown_2.29
## [23] UCSC.utils_1.4.0 tinytex_0.57
## [25] purrr_1.1.0 xfun_0.53
## [27] cachem_1.1.0 jsonlite_2.0.0
## [29] DelayedArray_0.34.1 BiocParallel_1.42.1
## [31] cluster_2.1.8.1 broom_1.0.9
## [33] parallel_4.5.0 R6_2.6.1
## [35] bslib_0.9.0 stringi_1.8.7
## [37] RColorBrewer_1.1-3 car_3.1-3
## [39] lubridate_1.9.4 jquerylib_0.1.4
## [41] Rcpp_1.1.0 bookdown_0.44
## [43] SummarizedExperiment_1.38.1 iterators_1.0.14
## [45] knitr_1.50 Matrix_1.7-4
## [47] timechange_0.3.0 tidyselect_1.2.1
## [49] rstudioapi_0.17.1 dichromat_2.0-0.1
## [51] abind_1.4-8 yaml_2.3.10
## [53] doParallel_1.0.17 codetools_0.2-20
## [55] curl_7.0.0 plyr_1.8.9
## [57] lattice_0.22-7 tibble_3.3.0
## [59] Biobase_2.68.0 withr_3.0.2
## [61] evaluate_1.0.5 zip_2.3.3
## [63] pillar_1.11.0 BiocManager_1.30.26
## [65] MatrixGenerics_1.20.0 carData_3.0-5
## [67] foreach_1.5.2 RCurl_1.98-1.17
## [69] scales_1.4.0 glue_1.8.0
## [71] mapproj_1.2.12 tools_4.5.0
## [73] ggsignif_0.6.4 GenomicAlignments_1.44.0
## [75] fastcluster_1.3.0 XML_3.99-0.19
## [77] Cairo_1.6-5 tidyr_1.3.1
## [79] colorspace_2.1-1 GenomeInfoDbData_1.2.14
## [81] restfulr_0.0.16 Formula_1.2-5
## [83] cli_3.6.5 S4Arrays_1.8.1
## [85] gtable_0.3.6 rstatix_0.7.2.999
## [87] BSgenome.Hsapiens.UCSC.hg19_1.4.3 sass_0.4.10
## [89] digest_0.6.37 SparseArray_1.8.1
## [91] rjson_0.2.23 farver_2.1.2
## [93] htmltools_0.5.8.1 lifecycle_1.0.4
## [95] httr_1.4.7 GlobalOptions_0.1.2