Here we use previous WGBS data from normal and neoplastic B cells, and map the identified fCpGs in array data and check the methylation status in normal and neoplastic B cells.

1 Load packages, fCpGs information and WGBS data

##
## Load necessary packages and set global options
##

library(data.table)
library(janitor)
library(dplyr)
library(ggpubr)
library(pals)
library(GenomicRanges)
library(openxlsx)
library(BSgenome.Hsapiens.UCSC.hg38)

options(stringsAsFactors = F,max.print = 10000,error=NULL,
        "openxlsx.dateFormat" = "dd/mm/yyyy"
)



## load fCpGs with meth values

fcpgs.betas <- 
  fread("../Data/FMC/Final_data/beta_filtered_samples_fcpgs_laplacian.tsv",data.table = F) %>% 
  tibble::column_to_rownames(var = "V1")

fcpgs.betas[1:5,1:5]
dim(fcpgs.betas)

fcpgs <-
  openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
                      sheet = "supplementary_table_3") %>% 
  clean_names() 

fcpgs  <- 
  GRanges(fcpgs$chromosome,IRanges(fcpgs$position,fcpgs$position),strand = fcpgs$strand,
          fcpgs %>% select(-c(chromosome,position,strand))
  )
fcpgs

getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="-"]) %>% as.character() %>% table

##perform liftover
hg19.to.hg38.chain <- rtracklayer::import.chain(con = "../Revision/Data/Nanopore/hg19ToHg38.over.chain")
fcpgs.hg38 <- rtracklayer::liftOver(x = fcpgs,chain = hg19.to.hg38.chain)
fcpgs.hg38 <- unlist(fcpgs.hg38)

length(fcpgs)
length(fcpgs.hg38)

fcpgs$name[which(!fcpgs$name %in% fcpgs.hg38$name)]

fcpgs.hg38

getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table

fcpgs.hg38[strand(fcpgs.hg38)=="-"] %>% granges()

##liftover do not work on this CpG
fcpgs.hg38[strand(fcpgs.hg38)=="-"][which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G"),]
exclude <- fcpgs.hg38[strand(fcpgs.hg38)=="-"]$name[which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G")]


fcpgs.hg38 <- fcpgs.hg38[fcpgs.hg38$name!=exclude]
fcpgs.hg38

#check nucleotides
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
# ok!






## meth fcpgs
# update annotation for - stand, as array annot of CG always refer to + strand and first nucleotide
# fcpgs.hg38[which(strand(fcpgs.hg38)=="-")] <- 
#   resize(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")],width = 2,fix = "end")
# fcpgs.hg38
# start(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")]) <- 
#   end(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")])
# fcpgs.hg38 %>% width() %>% table
# fcpgs.hg38

getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% unique()
##ok"


fcpgs.hg38 <- sort(sortSeqlevels(fcpgs.hg38),ignore.strand=T)
fcpgs.hg38



##
## Load previous WGBS
##


load("../../WGBS/WGBS_57_10reads_NAs.RData")

head(meth.final);dim(meth.final)
class(meth.final)
colnames(meth.final) <- c("chr","start",
                          "S1.1","S1.2","S3.1","S3.2","NBCB8","NBCB7","NBCB9","GCBC3","GCBC5","GCBC2","ncsMBC3","csMBC5","csMBC6","PCBM1","PCBM2","PCT2","PCT6","PCT5",
                          "ALL.M485","ALL.M486",
                          "MCL8761","MCL2392","MCL81703","MCL6902","MCL007",
                          "CLL3","CLL182","CLL110","CLL12","CLL1525","CLL1532","CLL1228","CLL16",
                          "FL","FL","FL","BL","FL","BL","FL","BL","BL","UN","BL","FL","BL","FL","BL","BL","BL","BL","BL",
                          "MM54847","MM55358","MM54498","MM35574","MM35174"
                          ) %>% make.unique()

wgbs <- GRanges(seqnames = meth.final$chr,ranges = IRanges(meth.final$start,meth.final$start))
mcols(wgbs) <- meth.final %>% select(-c(chr,start))
wgbs <- sort(sortSeqlevels(wgbs),ignore.strand=T)
wgbs

rm(meth.final);gc()


## checks
# wgbs.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,wgbs)
# wgbs.seq
# wgbs.seq %>% as.matrix() %>% apply(.,2,table)

2 fCpGs in WGBS data

We next overlap fCpGs with WGBS data.

##overlap with fcpgs

fcpgs.overlaps <- findOverlaps(query = fcpgs.hg38,
                               subject = wgbs,
                               ignore.strand=T
                               )
fcpgs.overlaps 
subjectHits(fcpgs.overlaps) %>% length()
subjectHits(fcpgs.overlaps) %>% unique() %>% length()

fcpgs.meth <- wgbs[subjectHits(fcpgs.overlaps)]
fcpgs.meth %>% width() %>% table
fcpgs.meth

##names regions with cpg names
queryHits(fcpgs.overlaps) %>% length()
queryHits(fcpgs.overlaps) %>% unique() %>% length()
names(fcpgs.meth) <- fcpgs.hg38$name[queryHits(fcpgs.overlaps)]
identical(fcpgs.meth,sort(sortSeqlevels(fcpgs.meth),ignore.strand=TRUE))
fcpgs.meth

getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.meth) %>% as.character() %>% table() ## al Cs! OK!

3 Heatmap

We next remove some samples with lower CpG coverages and plot the heatmap.

##
## Plot
##


##heamtap
library(ComplexHeatmap)
library(circlize)


## get colors for heatmap

cols.paper <- c(
  NA_values="#666666",
  HPC="#E6E6E6",
  pre.Bcell="#D5D5D5",
  NBC="#C3C3C3",
  GCB="#AEAEAE",
  tPC="#969696",
  MBC="#787878",
  bmPC="#4D4D4D",
  Tcell="#F0F8FF",
  PBMCs="#FFF8DC",
  WholeBlood="#CDC8B1",
  Normal_lymphoid_cells="#D5D5D5",
  Normal_myeloid_cells="#E6E6FA",
  TALL="#CD3333",
  TALL.remission="#C7A5A5",
  BALL="#E69F00",
  BALL.remission="#D8CAAB",
  MCL="#F0E442",
  C1.or.cMCL="#F5D51B",
  C2.or.nnMCL="#0072B2",
  DLBCL.NOS="#56B4E9",
  MBL="#5A9A89",
  CLL="#009E73",
  RT="#02C590",
  CLL.unmutated="#e65100",
  CLL.mutated="#361379",
  nCLL="#006E93",
  iCLL="#FDC010",
  mCLL="#963736",
  Disease.Unclassified="#CCCCCC",
  MGUS="#BF99AE",
  MM="#CC79A7"
)


evoflux.meth <- fread("../Revision/Data/meth_palette_evoflux.tsv")
evoflux.meth



## get matrix


mat.meth <- mcols(fcpgs.meth) %>% as.matrix()
##reatain cell types analyzed in the sutdy
mat.meth <- mat.meth[,grep("^FL|BL|UN",colnames(mat.meth),invert = T)]
head(mat.meth);dim(mat.meth)

( colSums(is.na(mat.meth))/nrow(mat.meth) ) %>% hist(.,main="CpG missigness across across samples")

exclude <- sort(colSums(is.na(mat.meth))/nrow(mat.meth))[ncol(mat.meth)] %>% names()

##remove 1 MM with many NA's 
mat.meth <- mat.meth[ ,!colnames(mat.meth) %in% exclude]
head(mat.meth);dim(mat.meth)

##check CpGs across samples
( rowSums(is.na(mat.meth))/ncol(mat.meth) ) %>% hist(.,main="Missigness across CpGs")

dim(mat.meth)
mat.meth <- mat.meth[ (rowSums(is.na(mat.meth))/ncol(mat.meth) ) <= 0.25,]
dim(mat.meth)



##set column annotation
colnames(mat.meth)
samples.annotation <- setNames(colnames(mat.meth),NA)
names(samples.annotation)[grep("^ALL|^MCL|^CLL|^MM",colnames(mat.meth),invert = T)] <- "Normal B cell"
names(samples.annotation)[grep("^ALL",colnames(mat.meth),invert = F)] <- "BALL"
names(samples.annotation)[grep("^MCL",colnames(mat.meth),invert = F)] <- "MCL"
names(samples.annotation)[grep("^CLL",colnames(mat.meth),invert = F)] <- "CLL"
names(samples.annotation)[grep("^MM",colnames(mat.meth),invert = F)] <- "MM"
samples.annotation

ht_opt$legend_grid_width <- unit(2,"mm")
ht_opt$legend_grid_height <-unit(2,"mm")
ht_opt$legend_labels_gp <- gpar(fontsize=5)
ht_opt$legend_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_names_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_names_gp <- gpar(fontsize=5)
ht_opt$HEATMAP_LEGEND_PADDING <- unit(1,"mm")
ht_opt$ANNOTATION_LEGEND_PADDING <- unit(1,"mm")
ht_opt$legend_gap <- unit(c(1,1),"mm")
ht_opt$fast_hclust <- T

h <- Heatmap(matrix = mat.meth,
             row_title=paste0("CpGs=",nrow(mat.meth)," (at least 10 reads in each sample)"),
             cluster_columns = T,
             cluster_rows = T,
             show_row_names = F,
             show_column_names = F,
             show_row_dend = F,
             show_column_dend = F,
             name="DNA methylation",
             border = T,
             col=evoflux.meth$color,
             column_split = ifelse(names(samples.annotation)=="Normal B cell","Normal B cells","B-cell tumors"),
             cluster_column_slices = F,
             row_dend_gp = gpar(lwd=0.01),
             column_dend_gp = gpar(lwd=0.3),
             na_col = "grey0",
             top_annotation = HeatmapAnnotation("Cell type"=names(samples.annotation),
                                                col=list("Cell type"=c(cols.paper,"Normal B cell"="#D5D5D5")
                                                         ),
                                                show_annotation_name = F,
                                                na_col = "grey45",
                                                annotation_name_side = "left",
                                                show_legend = F,
                                                annotation_name_gp = gpar(fontsize=5),
                                                simple_anno_size = unit(2,"mm")
                                                ),
             heatmap_legend_param = list(break_dist = 1,at=seq(0,1,by=0.2),direction="horizontal"),
             )



draw(h,
     column_title = "WGBS",
     ht_gap = unit(2,"mm"),
     merge_legend=T,
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom"
)

4 fCpG methylation distribution

We next perform histogramsn to see the distribution of CpG methylation.

for (i in colnames(mat.meth)){
  mat.meth.hist <- 
    mat.meth %>% 
    reshape2::melt() %>%
    filter(Var2==i) %>% 
    tidyr::drop_na() %>% 
    mutate(N.CpGs=n(),
           sample_type=names(samples.annotation)[samples.annotation==unique(Var2)] 
           )
    p <- mat.meth.hist %>% 
      ggplot(aes(x=value,
                 fill=sample_type
                 ))+
      geom_histogram(color="grey20",lwd=0.1)+
      scale_fill_manual(values = c(cols.paper,"Normal B cell"="#D5D5D5"))+
      xlab("DNA methylation")+
      theme_classic()+
      ggtitle(paste0(mat.meth.hist$N.CpGs %>% unique()," CpGs with at least 10 reads (sample: ",i,")"))+
      theme(text = element_text(colour = "grey0",size = 5),
            panel.grid.major = element_line(linewidth = 0.075),
            # panel.grid.minor.x = element_line(linewidth = 0.025),
            strip.placement = "out",
            strip.background = element_blank(),
            strip.text.y =  element_text(angle = 0),
            plot.margin = unit(c(0,0,0,0),"pt"),
            line=element_line(linewidth = 0.1),
            legend.box.margin=margin(-5,-5,-5,-5),
            legend.position = "bottom",
            plot.background =element_blank(),
            legend.key.size = unit(5,"pt")
      )
    print(p)
}

5 Session info

print(sessionInfo())

## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Madrid
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] circlize_0.4.16                   ComplexHeatmap_2.24.1            
##  [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.76.0                  
##  [5] rtracklayer_1.68.0                BiocIO_1.18.0                    
##  [7] Biostrings_2.76.0                 XVector_0.48.0                   
##  [9] openxlsx_4.2.8                    GenomicRanges_1.60.0             
## [11] GenomeInfoDb_1.44.2               IRanges_2.42.0                   
## [13] S4Vectors_0.46.0                  BiocGenerics_0.54.0              
## [15] generics_0.1.4                    pals_1.10                        
## [17] ggpubr_0.6.1                      ggplot2_3.5.2                    
## [19] dplyr_1.1.4                       janitor_2.2.1                    
## [21] data.table_1.17.8                 BiocStyle_2.36.0                 
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-9                      rlang_1.1.6                      
##  [3] magrittr_2.0.3                    clue_0.3-66                      
##  [5] GetoptLong_1.0.5                  snakecase_0.11.1                 
##  [7] matrixStats_1.5.0                 compiler_4.5.0                   
##  [9] reshape2_1.4.4                    png_0.1-8                        
## [11] vctrs_0.6.5                       maps_3.4.3                       
## [13] stringr_1.5.1                     shape_1.4.6.1                    
## [15] pkgconfig_2.0.3                   crayon_1.5.3                     
## [17] fastmap_1.2.0                     magick_2.8.7                     
## [19] backports_1.5.0                   labeling_0.4.3                   
## [21] Rsamtools_2.24.0                  rmarkdown_2.29                   
## [23] UCSC.utils_1.4.0                  tinytex_0.57                     
## [25] purrr_1.1.0                       xfun_0.53                        
## [27] cachem_1.1.0                      jsonlite_2.0.0                   
## [29] DelayedArray_0.34.1               BiocParallel_1.42.1              
## [31] cluster_2.1.8.1                   broom_1.0.9                      
## [33] parallel_4.5.0                    R6_2.6.1                         
## [35] bslib_0.9.0                       stringi_1.8.7                    
## [37] RColorBrewer_1.1-3                car_3.1-3                        
## [39] lubridate_1.9.4                   jquerylib_0.1.4                  
## [41] Rcpp_1.1.0                        bookdown_0.44                    
## [43] SummarizedExperiment_1.38.1       iterators_1.0.14                 
## [45] knitr_1.50                        Matrix_1.7-4                     
## [47] timechange_0.3.0                  tidyselect_1.2.1                 
## [49] rstudioapi_0.17.1                 dichromat_2.0-0.1                
## [51] abind_1.4-8                       yaml_2.3.10                      
## [53] doParallel_1.0.17                 codetools_0.2-20                 
## [55] curl_7.0.0                        plyr_1.8.9                       
## [57] lattice_0.22-7                    tibble_3.3.0                     
## [59] Biobase_2.68.0                    withr_3.0.2                      
## [61] evaluate_1.0.5                    zip_2.3.3                        
## [63] pillar_1.11.0                     BiocManager_1.30.26              
## [65] MatrixGenerics_1.20.0             carData_3.0-5                    
## [67] foreach_1.5.2                     RCurl_1.98-1.17                  
## [69] scales_1.4.0                      glue_1.8.0                       
## [71] mapproj_1.2.12                    tools_4.5.0                      
## [73] ggsignif_0.6.4                    GenomicAlignments_1.44.0         
## [75] fastcluster_1.3.0                 XML_3.99-0.19                    
## [77] Cairo_1.6-5                       tidyr_1.3.1                      
## [79] colorspace_2.1-1                  GenomeInfoDbData_1.2.14          
## [81] restfulr_0.0.16                   Formula_1.2-5                    
## [83] cli_3.6.5                         S4Arrays_1.8.1                   
## [85] gtable_0.3.6                      rstatix_0.7.2.999                
## [87] BSgenome.Hsapiens.UCSC.hg19_1.4.3 sass_0.4.10                      
## [89] digest_0.6.37                     SparseArray_1.8.1                
## [91] rjson_0.2.23                      farver_2.1.2                     
## [93] htmltools_0.5.8.1                 lifecycle_1.0.4                  
## [95] httr_1.4.7                        GlobalOptions_0.1.2

Whole-genome bisulfite sequencing data, Gabbutt and Duran-Ferrer 2025

10/07/2025

1 Load packages, fCpGs information and WGBS data

2 fCpGs in WGBS data

3 Heatmap

4 fCpG methylation distribution

5 Session info