Analyses related to long-read Nanopore data. We start the analyses with the counts provided by CNAG. This matrix needs filtering and curation.

1 Load packages and fCpGs information

##
## Load necessary packages and set global options
##

library(data.table)
library(janitor)
library(dplyr)
library(ggpubr)
library(pals)
library(GenomicRanges)
library(openxlsx)
library(BSgenome.Hsapiens.UCSC.hg38)

options(stringsAsFactors = F,max.print = 10000,error=NULL,
        "openxlsx.dateFormat" = "dd/mm/yyyy"
)


## load fCpGs with meth values

fcpgs.betas <- 
  fread("../Data/FMC/Final_data/beta_filtered_samples_fcpgs_laplacian.tsv",data.table = F) %>% 
  tibble::column_to_rownames(var = "V1")

fcpgs.betas[1:5,1:5]
dim(fcpgs.betas)

fcpgs <-
  openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
                      sheet = "supplementary_table_3") %>% 
  clean_names() 

fcpgs  <- 
  GRanges(fcpgs$chromosome,IRanges(fcpgs$position,fcpgs$position),strand = fcpgs$strand,
          fcpgs %>% select(-c(chromosome,position,strand))
          )
fcpgs

getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="-"]) %>% as.character() %>% table

##perform liftover
hg19.to.hg38.chain <- rtracklayer::import.chain(con = "../Revision/Data/Nanopore/hg19ToHg38.over.chain")
fcpgs.hg38 <- rtracklayer::liftOver(x = fcpgs,chain = hg19.to.hg38.chain)
fcpgs.hg38 <- unlist(fcpgs.hg38)

length(fcpgs)
length(fcpgs.hg38)

fcpgs$name[which(!fcpgs$name %in% fcpgs.hg38$name)]

fcpgs.hg38

getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table

fcpgs.hg38[strand(fcpgs.hg38)=="-"] %>% granges()

##liftover do not work on this CpG
fcpgs.hg38[strand(fcpgs.hg38)=="-"][which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G"),]
exclude <- fcpgs.hg38[strand(fcpgs.hg38)=="-"]$name[which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G")]


fcpgs.hg38 <- fcpgs.hg38[fcpgs.hg38$name!=exclude]
fcpgs.hg38

#check nucleotides
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
# ok!


##load metadata

sheets <- getSheetNames("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx")
sheets <- setNames(sheets,sheets)
sheets

fcpgs.metadata <- parallel::mclapply(sheets,function(i){
  openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
                      sheet = i,
                      detectDates = T)
},mc.cores = 9) 

head(fcpgs.metadata$supplementary_table_1)



## load metadata of meth nanopore samples

nano.metadata <- 
  openxlsx::read.xlsx("../Revision/Data/Nanopore/Nanopore_CLL_Bcell_curated.xlsx") %>% 
  clean_names()

head(nano.metadata)

nano.metadata %>% select(sample_barcode,sample_name,sample_id_curated)
nano.metadata %>% View()



##load meth data
gc()
meth.cov <- 
  fread("../Revision/Data/Nanopore/beds/BCLLatlas_5mC_CpG_meth_cov.txt.gz") %>%
  GRanges()
gc()
meth.cov

colnames(mcols(meth.cov)) <- gsub(":","_",colnames(mcols(meth.cov)))
old.colnames <- colnames(mcols(meth.cov))
old.colnames
nano.metadata %>% arrange(sample_id_curated) %>% View()

##change name normal B cells
colnames(mcols(meth.cov))[gsub("_meth|_cov","",colnames(mcols(meth.cov))) %in% nano.metadata$sample_barcode] <- 
  paste0(nano.metadata$sample_id_curated[match(gsub("_meth|_cov","",colnames(mcols(meth.cov))),nano.metadata$sample_barcode) %>% na.omit()],
       c("_meth","_cov")
       )
colnames(mcols(meth.cov))

##change name  CLLs
colnames(mcols(meth.cov))[gsub("_meth|_cov","",colnames(mcols(meth.cov))) %in% nano.metadata$flowcell_name] <- 
  paste0(nano.metadata$sample_id_curated[match(gsub("_meth|_cov","",colnames(mcols(meth.cov))),nano.metadata$flowcell_name) %>% na.omit()],
         c("_meth","_cov")
  )

data.frame(old.colnames,new.colnames=colnames(mcols(meth.cov))) %>% arrange(new.colnames)
length(meth.cov)

## Order and subset chromomes
meth.cov <- sort(sortSeqlevels(meth.cov),ignore.strand=T)
meth.cov <- meth.cov[which(seqnames(meth.cov) %in% paste0("chr",1:22)),]
meth.cov %>% width() %>% table
gc()

##
## In combined strand files, the informatino of methylation for each strand is merged! 
##

# check it out:!
meth.cov.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,meth.cov)
meth.cov.seq
identical(length(meth.cov.seq),length(meth.cov))
##2 and 3 nucleotide should always be CG! # 1st one is relatex to differnet reference systems in R
apply(as.matrix(meth.cov.seq),2,table) ##ok!!

## set coordinates accordingly to match array CG annotation
start(meth.cov) <- start(meth.cov)+1
meth.cov
meth.cov.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,meth.cov)
meth.cov.seq
identical(length(meth.cov.seq),length(meth.cov))
## All should be CG!!!
apply(as.matrix(meth.cov.seq),2,unique) ##ok!!

##make some checks
width(meth.cov) %>% table() ## 
meth.cov
##all good! manually check at UCSC browser!

2 QC checkings

We next check some QC metrics, such as sequencing depth, CpGs passing, etc.

2.1 CpGs passing

##checkings
meth.cov %>% 
  mcols() %>% 
  data.frame() %>% 
  dplyr::select(n_pass) %>% 
  ggplot(aes(x=n_pass)) +
  geom_histogram()+
  ggtitle("Number of CpGs passing in across smaples")+
  theme_bw()+
  theme(text = element_text(colour = "grey0",size = 5),
        panel.grid.major = element_line(linewidth = 0.075),
        # panel.grid.minor.x = element_line(linewidth = 0.025),
        strip.placement = "out",
        strip.background = element_blank(),
        strip.text =  element_text(angle = 0),
        plot.margin = unit(c(0,0,0,0),"pt"),
        line=element_line(linewidth = 0.1),
        legend.position = "bottom",
        legend.box.margin=margin(-5,-5,-5,-5),
        # legend.justification="top",
        legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
        plot.background =element_blank(),
        legend.key.size = unit(5,"pt")
  )

2.2 Coverage

We next check the coverage of each sample. We can see there is 1 sample with significantly less coverage, and we subseqeuntly remove it.

df.cov <-
  meth.cov %>% 
  mcols() %>% 
  data.frame() %>% 
  dplyr::select(contains("_cov")) %>% 
  reshape2::melt() %>% 
  group_by(variable) %>%
  summarise(boxplot.stats=list(boxplot.stats(value)))

df.cov <- cbind(df.cov[,"variable"],
                lapply(df.cov$boxplot.stats,function(x)x$stats) %>% do.call(rbind,.)
                )
df.cov

df.cov %>% 
  ggplot(aes(x=variable,
             y=`3`,
             ))+
  geom_linerange(aes(ymin=`1`,
                     ymax=`5`),
                 lwd=0.3
  )+
  geom_crossbar(aes(ymin=`2`,
                    ymax=`4`),
                lwd=0.3,
                fill="grey80"
                )+
  theme_bw()+
  ylab("Reads covering CpGs (outliers removed)")+
  ggtitle("Read coverage")+
  xlab(NULL)+
  theme(text = element_text(colour = "grey0",size = 5),
        axis.text.x = element_text(angle = 90,vjust = 1,hjust = 1),
        panel.grid.major = element_line(linewidth = 0.075),
        # panel.grid.minor.x = element_line(linewidth = 0.025),
        strip.placement = "out",
        strip.background = element_blank(),
        strip.text =  element_text(angle = 0),
        plot.margin = unit(c(0,0,0,0),"pt"),
        line=element_line(linewidth = 0.1),
        legend.position = "bottom",
        legend.box.margin=margin(-5,-5,-5,-5),
        # legend.justification="top",
        legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
        plot.background =element_blank(),
        legend.key.size = unit(5,"pt")
  )

##
## Remove sample with low coverage
##

samples.low.coverage <- "012-06-01TD"
meth.cov <- meth.cov[,grep(samples.low.coverage,colnames(mcols(meth.cov)),invert = T,value = T)]
meth.cov$n_pass2 <- 
  rowSums(apply(meth.cov[,grep("_cov$",colnames(mcols(meth.cov)))] %>% mcols() %>% as.matrix,2,function(x)!is.na(x)))
gc()
table(meth.cov$n_pass2)
length(meth.cov)
meth.cov <- meth.cov[meth.cov$n_pass2!=0] ##remove low-coverage sample-specific regions
length(meth.cov)
gc()

3 Genome-wide 5mC

We next plot the genome-wide methylation of the cleaned DNA methylation matrix, with those CpGs covered by at least 10 reads.

N.min.reads <- 10 ##
##number of CpGs with more than 10 reads

##genome-wide meth
p <- meth.cov %>% 
  mcols() %>% 
  as.data.frame() %>% 
  ##10 reads covering a CpG in all the samples
  filter(n_pass2==ncol(.[,grep("_cov$",colnames(.))])) %>% 
  filter(rowSums(apply(.[,grep("_cov$",colnames(.))],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.))])) %>%
  select(contains("meth"))
N.CpGs <- nrow(p)

p <- p %>% 
  reshape2::melt() %>% 
  ggplot(aes(x=value))+
  facet_wrap(variable~.,scales = "free_y")+
  geom_histogram()+
  theme_classic()+
  ggtitle(paste0(N.min.reads," reads in all samples. CpGs=",N.CpGs))+
  theme(text = element_text(colour = "grey0",size = 5),
        panel.grid.major = element_line(linewidth = 0.075),
        # panel.grid.minor.x = element_line(linewidth = 0.025),
        strip.placement = "out",
        strip.background = element_blank(),
        strip.text.y =  element_text(angle = 0),
        plot.margin = unit(c(0,0,0,0),"pt"),
        line=element_line(linewidth = 0.1),
        legend.position = "bottom",
        legend.box.margin=margin(-5,-5,-5,-5),
        # legend.justification="top",
        legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
        plot.background =element_blank(),
        legend.key.size = unit(5,"pt")
  )
print(p)

gc()

4 fCpG methylation

We next check the methylation of fCpGs

## meth fcpgs
# update annotation for - stand, as array annot of CG always refer to + strand and first nucleotide
fcpgs.hg38[which(strand(fcpgs.hg38)=="-")] <- 
  resize(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")],width = 2,fix = "end")
fcpgs.hg38
start(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")]) <- 
  end(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")])
fcpgs.hg38 %>% width() %>% table
fcpgs.hg38

##check nucleotide seq, always should be a C!
getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table



##get overlaps


##sort before overlapping
gc()
fcpgs.hg38 <- sort(sortSeqlevels(fcpgs.hg38),ignore.strand=TRUE)
meth.cov <- sort(sortSeqlevels(meth.cov),ignore.strand=TRUE)

fcpgs.overlaps <- findOverlaps(query = fcpgs.hg38,
                               subject = meth.cov,
                               ignore.strand=T
                               )
fcpgs.overlaps ## all present!
subjectHits(fcpgs.overlaps) %>% length()
subjectHits(fcpgs.overlaps) %>% unique() %>% length()

fcpgs.meth <- meth.cov[subjectHits(fcpgs.overlaps)]
fcpgs.meth %>% width() %>% table
fcpgs.meth

##names regions with cpg names
queryHits(fcpgs.overlaps) %>% length()
queryHits(fcpgs.overlaps) %>% unique() %>% length()
names(fcpgs.meth) <- fcpgs.hg38$name[queryHits(fcpgs.overlaps)]
identical(fcpgs.meth,sort(sortSeqlevels(fcpgs.meth),ignore.strand=TRUE))
fcpgs.meth

getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.meth) %>% as.character() %>% table()



N.min.reads <- 10

p <- fcpgs.meth %>% 
  mcols() %>% 
  as.data.frame() %>% 
  ##10 reads covering a CpG in all the samples
  filter(n_pass2==ncol(.[,grep("_cov$",colnames(.))])) %>% 
  filter(rowSums(apply(.[,grep("_cov$",colnames(.))],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.))])) %>%
  select(contains("meth"))
N.CpGs <- nrow(p)
N.CpGs

p %>% 
  reshape2::melt() %>% 
  ggplot(aes(x=value))+
  facet_wrap(variable~.,scales = "free_y")+
  geom_histogram()+
  xlab("DNA methylation")+
  ggtitle(paste0(N.min.reads," reads as minimum coversge. CpGs=",N.CpGs))+
  theme_classic()+
  theme(text = element_text(colour = "grey0",size = 5),
        panel.grid.major = element_line(linewidth = 0.075),
        # panel.grid.minor.x = element_line(linewidth = 0.025),
        strip.placement = "out",
        strip.background = element_blank(),
        strip.text.y =  element_text(angle = 0),
        plot.margin = unit(c(0,0,0,0),"pt"),
        line=element_line(linewidth = 0.1),
        legend.position = "bottom",
        legend.box.margin=margin(-5,-5,-5,-5),
        # legend.justification="top",
        legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
        plot.background =element_blank(),
        legend.key.size = unit(5,"pt")
  )

5 Matched nanopore and array methylation data

We next compare the fCpG methylation values measured by Nanopore and arrays of matched samples, and see great agreement.

#pairwise comparisons between same sample. illumina and nanopore

array.rep.samples <-
  fcpgs.betas %>% 
  select( (fcpgs.metadata$supplementary_table_1 %>% 
             filter(sample_id_new=="019-15-01TD" | sample_id_new=="019-0047-01TD" | sample_id_new=="012-19-01TD") %>% 
             pull(participant_id_anonymous) 
           )
          ) %>% 
  tibble::rownames_to_column("CpG") %>% 
  reshape2::melt() %>% 
  mutate(sample_id=fcpgs.metadata$supplementary_table_1$sample_id_new[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)],
         timepoint=fcpgs.metadata$supplementary_table_1$sample_timepoints[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)],
         platform = fcpgs.metadata$supplementary_table_1$platform[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)]
         ) %>% 
  rename(variable="participant_id_anonymous",
         value="array.meth")

head(array.rep.samples);dim(array.rep.samples)
array.rep.samples %>% count(sample_id,participant_id_anonymous)



##nanopore comparison with all paired array-nanopore samples (ie facets)

for(i in c(1,5,10,15,20,25,30)){
  
  N.min.reads <- i
  
  nano.rep.samples <-
    fcpgs.meth %>% 
    mcols() %>% 
    data.frame() %>% 
    select(starts_with(c("X019.0047.01TD","X019.15.01TD","X012.19.01TD"))) %>% 
    filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
    select(all_of(c("X019.0047.01TD_meth","X019.15.01TD_meth","X012.19.01TD_meth"))) %>% 
    tibble::rownames_to_column("CpG") %>% 
    rename(all_of(setNames(c("019-15-01TD","019-0047-01TD","012-19-01TD"),
                           c("X019.15.01TD_meth","X019.0047.01TD_meth","X012.19.01TD_meth")
                           )
                  )
           ) %>% 
    reshape2::melt() %>% 
    rename(variable="sample_id",
           value="nanopore.meth"
    )
  head(nano.rep.samples);dim(nano.rep.samples)
  nano.rep.samples %>% count(sample_id)
  
  meth.rep.samples <- 
    full_join(x = array.rep.samples,
              y=nano.rep.samples,
              by = join_by(CpG==CpG,sample_id==sample_id,),
              keep = T
    )
  
  meth.rep.samples %>% head
  #samples are sorted by samples and CpG
  meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth) 
  meth.rep.samples <- 
    meth.rep.samples %>% 
    tidyr::drop_na()
  N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
  p <- 
    meth.rep.samples %>% 
    mutate(facet=paste0(sample_id.x,"_",platform)) %>%
    ggplot(aes(x = nanopore.meth, ##nanopre data in X
               y= array.meth, ##array data on Y
               color=cpgs_diff
    ))+
    geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
    stat_density2d(aes(alpha=after_stat(level)),
                   lwd=0.1,
                   color="grey0"
    )+
    geom_point(pch=19,stroke=0,size=0.8)+
    geom_smooth(aes(group=facet),
                lwd=0.2,
                method = "lm",
                color="grey0"
    )+
    stat_cor(size=1)+
    ggtitle(paste0("At least ",N.min.reads," reads for all Cs in all samples, CpGs=",N.CpGs))+
    scale_color_gradientn("Methylation\ndifference",
                          colours = viridis(100) %>% rev(),
                          limits=c(0,0.25),
                          na.value = viridis(100)[1]
    )+
    facet_grid(~facet)+
    theme_classic()+
    theme(text = element_text(colour = "grey0",size = 5),
          panel.grid.major = element_line(linewidth = 0.075),
          # panel.grid.minor.x = element_line(linewidth = 0.025),
          strip.placement = "out",
          strip.background = element_blank(),
          strip.text =  element_text(angle = 0),
          plot.margin = unit(c(0,0,0,0),"pt"),
          line=element_line(linewidth = 0.1),
          legend.position = "bottom",
          legend.box.margin=margin(-5,-5,-5,-5),
          # legend.justification="top",
          legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
          plot.background =element_blank(),
          legend.key.size = unit(5,"pt")
    )
  
  print(p)
  
}

5.1 Sample 019-15-01TD

We next use the sample with the highest overage.

5.1.1 Scatterplots

for(i in c(1,10,20,30,40,45,50,55,60,65)){
  
  N.min.reads <- i
  
  nano.rep.samples <-
    fcpgs.meth %>% 
    mcols() %>% 
    data.frame() %>% 
    select(starts_with(c("X019.15.01TD"))) %>% 
    filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
    select(all_of(c("X019.15.01TD_meth"))) %>% 
    tibble::rownames_to_column("CpG") %>% 
    rename(all_of(setNames(c("019-15-01TD"),
                           c("X019.15.01TD_meth")
    )
    )
    ) %>% 
    reshape2::melt() %>% 
    rename(variable="sample_id",
           value="nanopore.meth"
    )
  head(nano.rep.samples);dim(nano.rep.samples)
  nano.rep.samples %>% count(sample_id)
  
  meth.rep.samples <- 
    full_join(x = array.rep.samples,
              y=nano.rep.samples,
              by = join_by(CpG==CpG,sample_id==sample_id,),
              keep = T
    )
  
  meth.rep.samples %>% head
  #samples are sorted by samples and CpG
  meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth) 
  meth.rep.samples <- 
    meth.rep.samples %>% 
    tidyr::drop_na()
  N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
  p <- 
    meth.rep.samples %>% 
    mutate(facet=paste0(sample_id.x,"_",platform)) %>%
    ggplot(aes(x = nanopore.meth, ##nanopre data in X
               y= array.meth, ##array data on Y
               color=cpgs_diff
    ))+
    geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
    stat_density2d(aes(alpha=after_stat(level)),
                   lwd=0.1,
                   color="grey0"
    )+
    geom_point(pch=19,stroke=0,size=0.8)+
    geom_smooth(aes(group=facet),
                lwd=0.2,
                method = "lm",
                color="grey0"
    )+
    stat_cor(size=1)+
    ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
    scale_color_gradientn("Methylation\ndifference",
                          colours = viridis(100) %>% rev(),
                          limits=c(0,0.25),
                          na.value = viridis(100)[1]
    )+
    facet_grid(~facet)+
    theme_classic()+
    theme(text = element_text(colour = "grey0",size = 5),
          panel.grid.major = element_line(linewidth = 0.075),
          # panel.grid.minor.x = element_line(linewidth = 0.025),
          strip.placement = "out",
          strip.background = element_blank(),
          strip.text =  element_text(angle = 0),
          plot.margin = unit(c(0,0,0,0),"pt"),
          line=element_line(linewidth = 0.1),
          legend.position = "bottom",
          legend.box.margin=margin(-5,-5,-5,-5),
          # legend.justification="top",
          legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
          plot.background =element_blank(),
          legend.key.size = unit(5,"pt")
    )
  
  print(p)
  
}

5.1.2 Histograms

We compare the distribution of methylation values of fCpGs measured by nanopore with arrays with increasing coverage. The distribution is very similar!

for(i in c(1,10,20,30,40,45,50,55,60,65)){
  
  N.min.reads <- i
    
    nano.rep.samples <-
    fcpgs.meth %>% 
    mcols() %>% 
    data.frame() %>% 
    select(starts_with(c("X019.15.01TD"))) %>% 
    filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
    select(all_of(c("X019.15.01TD_meth"))) %>% 
    tibble::rownames_to_column("CpG") %>% 
    rename(all_of(setNames(c("019-15-01TD"),
                           c("X019.15.01TD_meth")
    )
    )
    ) %>% 
    reshape2::melt() %>% 
    rename(variable="sample_id",
           value="nanopore.meth"
    )
  head(nano.rep.samples);dim(nano.rep.samples)
  nano.rep.samples %>% count(sample_id)
  
  meth.rep.samples <- 
    full_join(x = array.rep.samples,
              y=nano.rep.samples,
              by = join_by(CpG==CpG,sample_id==sample_id,),
              keep = T
    )
  
  meth.rep.samples <- 
    meth.rep.samples %>% 
    tidyr::drop_na()
  N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
  
  ##hisogram
  
  p <- 
    meth.rep.samples %>% 
    reshape2::melt(id.vars=c("sample_id.x","CpG.x"),measure.vars=c("array.meth","nanopore.meth")) %>% 
    ggplot(aes(x = value,  ##array data on Y
               fill=variable
    ))+
    geom_histogram(color="grey20",lwd=0.2)+
    scale_fill_brewer(palette = "Set2")+
    facet_grid(variable~.)+
    xlab("DNA methylation")+
    ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
    theme_classic()+
    theme(text = element_text(colour = "grey0",size = 5),
          panel.grid.major = element_line(linewidth = 0.075),
          # panel.grid.minor.x = element_line(linewidth = 0.025),
          strip.placement = "out",
          strip.background = element_blank(),
          strip.text =  element_text(angle = 0),
          plot.margin = unit(c(0,0,0,0),"pt"),
          line=element_line(linewidth = 0.1),
          legend.position = "bottom",
          legend.box.margin=margin(-5,-5,-5,-5),
          # legend.justification="top",
          legend.text = element_text(angle = 0,vjust = 0.5,hjust = 0.5),
          plot.background =element_blank(),
          legend.key.size = unit(5,"pt")
    )
  
  print(p)

}

5.2 Sample 012-19-01TD

5.2.1 Scatterplots

for(i in c(1,10,20,30,40,45,50,55,60,65)){
  
  N.min.reads <- i
  
  nano.rep.samples <-
    fcpgs.meth %>% 
    mcols() %>% 
    data.frame() %>% 
    select(starts_with(c("X012.19.01TD"))) %>% 
    filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
    select(all_of(c("X012.19.01TD_meth"))) %>% 
    tibble::rownames_to_column("CpG") %>% 
    rename(all_of(setNames(c("012-19-01TD"),
                           c("X012.19.01TD_meth")
    )
    )
    ) %>% 
    reshape2::melt() %>% 
    rename(variable="sample_id",
           value="nanopore.meth"
    )
  head(nano.rep.samples);dim(nano.rep.samples)
  nano.rep.samples %>% count(sample_id)
  
  meth.rep.samples <- 
    full_join(x = array.rep.samples,
              y=nano.rep.samples,
              by = join_by(CpG==CpG,sample_id==sample_id,),
              keep = T
    )
  
  meth.rep.samples %>% head
  #samples are sorted by samples and CpG
  meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth) 
  meth.rep.samples <- 
    meth.rep.samples %>% 
    tidyr::drop_na()
  N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
  p <- 
    meth.rep.samples %>% 
    mutate(facet=paste0(sample_id.x,"_",platform)) %>%
    ggplot(aes(x = nanopore.meth, ##nanopre data in X
               y= array.meth, ##array data on Y
               color=cpgs_diff
    ))+
    geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
    stat_density2d(aes(alpha=after_stat(level)),
                   lwd=0.1,
                   color="grey0"
    )+
    geom_point(pch=19,stroke=0,size=0.8)+
    geom_smooth(aes(group=facet),
                lwd=0.2,
                method = "lm",
                color="grey0"
    )+
    stat_cor(size=1)+
    ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
    scale_color_gradientn("Methylation\ndifference",
                          colours = viridis(100) %>% rev(),
                          limits=c(0,0.25),
                          na.value = viridis(100)[1]
    )+
    facet_grid(~facet)+
    theme_classic()+
    theme(text = element_text(colour = "grey0",size = 5),
          panel.grid.major = element_line(linewidth = 0.075),
          # panel.grid.minor.x = element_line(linewidth = 0.025),
          strip.placement = "out",
          strip.background = element_blank(),
          strip.text =  element_text(angle = 0),
          plot.margin = unit(c(0,0,0,0),"pt"),
          line=element_line(linewidth = 0.1),
          legend.position = "bottom",
          legend.box.margin=margin(-5,-5,-5,-5),
          # legend.justification="top",
          legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
          plot.background =element_blank(),
          legend.key.size = unit(5,"pt")
    )
  
  print(p)
  
}

5.2.2 Histograms

for(i in c(1,10,20,30,40,45,50,55,60,65)){
  
  N.min.reads <- i
  
  nano.rep.samples <-
    fcpgs.meth %>% 
    mcols() %>% 
    data.frame() %>% 
    select(starts_with(c("X012.19.01TD"))) %>% 
    filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
    select(all_of(c("X012.19.01TD_meth"))) %>% 
    tibble::rownames_to_column("CpG") %>% 
    rename(all_of(setNames(c("012-19-01TD"),
                           c("X012.19.01TD_meth")
    )
    )
    ) %>% 
    reshape2::melt() %>% 
    rename(variable="sample_id",
           value="nanopore.meth"
    )
  head(nano.rep.samples);dim(nano.rep.samples)
  nano.rep.samples %>% count(sample_id)
  
  meth.rep.samples <- 
    full_join(x = array.rep.samples,
              y=nano.rep.samples,
              by = join_by(CpG==CpG,sample_id==sample_id,),
              keep = T
    )
  
  meth.rep.samples <- 
    meth.rep.samples %>% 
    tidyr::drop_na()
  N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
  
  ##hisogram
  
  p <- 
    meth.rep.samples %>% 
    reshape2::melt(id.vars=c("sample_id.x","CpG.x"),measure.vars=c("array.meth","nanopore.meth")) %>% 
    ggplot(aes(x = value,  ##array data on Y
               fill=variable
    ))+
    geom_histogram(color="grey20",lwd=0.2)+
    scale_fill_brewer(palette = "Set2")+
    facet_grid(variable~.)+
    xlab("DNA methylation")+
    ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
    theme_classic()+
    theme(text = element_text(colour = "grey0",size = 5),
          panel.grid.major = element_line(linewidth = 0.075),
          # panel.grid.minor.x = element_line(linewidth = 0.025),
          strip.placement = "out",
          strip.background = element_blank(),
          strip.text =  element_text(angle = 0),
          plot.margin = unit(c(0,0,0,0),"pt"),
          line=element_line(linewidth = 0.1),
          legend.position = "bottom",
          legend.box.margin=margin(-5,-5,-5,-5),
          # legend.justification="top",
          legend.text = element_text(angle = 0,vjust = 0.5,hjust = 0.5),
          plot.background =element_blank(),
          legend.key.size = unit(5,"pt")
    )
  
  print(p)
  
  
}

6 Heatmap Nanopore data

We finally plot the heatmap with Nanopore data for all samples

##heamtap
library(ComplexHeatmap)
library(circlize)

ht_opt$legend_grid_width <- unit(2,"mm")
ht_opt$legend_grid_height <-unit(2,"mm")
ht_opt$legend_labels_gp <- gpar(fontsize=5)
ht_opt$legend_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_names_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_names_gp <- gpar(fontsize=5)
ht_opt$HEATMAP_LEGEND_PADDING <- unit(1,"mm")
ht_opt$ANNOTATION_LEGEND_PADDING <- unit(1,"mm")
ht_opt$legend_gap <- unit(c(1,1),"mm")
ht_opt$fast_hclust <- T


N.min.reads<-10

mat.meth <- 
  fcpgs.meth %>% 
  mcols() %>% data.frame() %>% 
  filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==
           ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
  select(contains("_meth"))
head(mat.meth);dim(mat.meth)

fcpgs.metadata$supplementary_table_1 %>% 
  filter(participant_id_new=="SCLL-019") %>% 
  select(sample_id_new,diagnosis_clinical)

sample.groups<- colnames(mat.meth)
sample.groups
sample.groups[grep("NBC|MBC",colnames(mat.meth))] <- "Bcell"
sample.groups[grep("19.15|12.19",colnames(mat.meth))] <- "RT"
sample.groups[grep("19.15|12.19|NBC|MBC",colnames(mat.meth),invert = T)] <- "CLL"
sample.groups



##load color palette used for paper
evoflux.meth <- fread("../Revision/Data/meth_palette_evoflux.tsv")
evoflux.meth




h <- Heatmap(matrix = mat.meth,
             col=evoflux.meth$color,
             row_title=paste0("N=",nrow(mat.meth)," (min reads ",N.min.reads,")"),
             cluster_columns = T,
             cluster_rows = T,
             show_row_names = F,
             show_row_dend = F,
             show_column_dend = F,
             name="DNA methylation",
             border = T,
             column_split = ifelse(sample.groups=="Bcell","Normal B cell", "B-cell tumor"),
             cluster_column_slices = F,
             row_dend_gp = gpar(lwd=0.01),
             column_dend_gp = gpar(lwd=0.3),
             na_col = "grey0",
             top_annotation = HeatmapAnnotation("Cell type"=sample.groups,
                                                annotation_legend_param = list(direction="horizontal",
                                                                               nrow=1),
                                                col=list("Cell type"=c("Bcell"="grey90",
                                                             "CLL"="#009E73",
                                                             "RT"="#02C599"
                                                             )
                                                         ),
                                                show_annotation_name = F,
                                                na_col = "grey45",
                                                annotation_name_side = "left",
                                                show_legend = F,
                                                annotation_name_gp = gpar(fontsize=5),
                                                simple_anno_size = unit(2,"mm")
                                                ),
             heatmap_legend_param = list(break_dist = 1,at=seq(0,1,by=0.2),direction="horizontal"),
)

draw(h,
     ht_gap = unit(2,"mm"),
     merge_legend=T,
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom"
)

7 Session info

print(sessionInfo())

## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Madrid
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] circlize_0.4.16                   ComplexHeatmap_2.24.1            
##  [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.76.0                  
##  [5] rtracklayer_1.68.0                BiocIO_1.18.0                    
##  [7] Biostrings_2.76.0                 XVector_0.48.0                   
##  [9] openxlsx_4.2.8                    GenomicRanges_1.60.0             
## [11] GenomeInfoDb_1.44.2               IRanges_2.42.0                   
## [13] S4Vectors_0.46.0                  BiocGenerics_0.54.0              
## [15] generics_0.1.4                    pals_1.10                        
## [17] ggpubr_0.6.1                      ggplot2_3.5.2                    
## [19] dplyr_1.1.4                       janitor_2.2.1                    
## [21] data.table_1.17.8                 BiocStyle_2.36.0                 
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9                      rlang_1.1.6                      
##   [3] magrittr_2.0.3                    clue_0.3-66                      
##   [5] GetoptLong_1.0.5                  snakecase_0.11.1                 
##   [7] matrixStats_1.5.0                 compiler_4.5.0                   
##   [9] mgcv_1.9-3                        png_0.1-8                        
##  [11] reshape2_1.4.4                    vctrs_0.6.5                      
##  [13] maps_3.4.3                        stringr_1.5.1                    
##  [15] shape_1.4.6.1                     pkgconfig_2.0.3                  
##  [17] crayon_1.5.3                      fastmap_1.2.0                    
##  [19] magick_2.8.7                      backports_1.5.0                  
##  [21] labeling_0.4.3                    Rsamtools_2.24.0                 
##  [23] rmarkdown_2.29                    UCSC.utils_1.4.0                 
##  [25] tinytex_0.57                      purrr_1.1.0                      
##  [27] xfun_0.53                         cachem_1.1.0                     
##  [29] jsonlite_2.0.0                    DelayedArray_0.34.1              
##  [31] BiocParallel_1.42.1               cluster_2.1.8.1                  
##  [33] broom_1.0.9                       parallel_4.5.0                   
##  [35] R6_2.6.1                          bslib_0.9.0                      
##  [37] stringi_1.8.7                     RColorBrewer_1.1-3               
##  [39] car_3.1-3                         lubridate_1.9.4                  
##  [41] jquerylib_0.1.4                   iterators_1.0.14                 
##  [43] Rcpp_1.1.0                        bookdown_0.44                    
##  [45] SummarizedExperiment_1.38.1       knitr_1.50                       
##  [47] R.utils_2.13.0                    splines_4.5.0                    
##  [49] Matrix_1.7-4                      timechange_0.3.0                 
##  [51] tidyselect_1.2.1                  rstudioapi_0.17.1                
##  [53] dichromat_2.0-0.1                 abind_1.4-8                      
##  [55] yaml_2.3.10                       doParallel_1.0.17                
##  [57] codetools_0.2-20                  curl_7.0.0                       
##  [59] plyr_1.8.9                        lattice_0.22-7                   
##  [61] tibble_3.3.0                      Biobase_2.68.0                   
##  [63] withr_3.0.2                       evaluate_1.0.5                   
##  [65] isoband_0.2.7                     zip_2.3.3                        
##  [67] pillar_1.11.0                     BiocManager_1.30.26              
##  [69] MatrixGenerics_1.20.0             carData_3.0-5                    
##  [71] foreach_1.5.2                     RCurl_1.98-1.17                  
##  [73] scales_1.4.0                      glue_1.8.0                       
##  [75] mapproj_1.2.12                    tools_4.5.0                      
##  [77] ggsignif_0.6.4                    GenomicAlignments_1.44.0         
##  [79] fastcluster_1.3.0                 XML_3.99-0.19                    
##  [81] Cairo_1.6-5                       tidyr_1.3.1                      
##  [83] colorspace_2.1-1                  nlme_3.1-168                     
##  [85] GenomeInfoDbData_1.2.14           restfulr_0.0.16                  
##  [87] Formula_1.2-5                     cli_3.6.5                        
##  [89] S4Arrays_1.8.1                    gtable_0.3.6                     
##  [91] R.methodsS3_1.8.2                 rstatix_0.7.2.999                
##  [93] BSgenome.Hsapiens.UCSC.hg19_1.4.3 sass_0.4.10                      
##  [95] digest_0.6.37                     SparseArray_1.8.1                
##  [97] rjson_0.2.23                      farver_2.1.2                     
##  [99] htmltools_0.5.8.1                 R.oo_1.27.1                      
## [101] lifecycle_1.0.4                   httr_1.4.7                       
## [103] GlobalOptions_0.1.2               MASS_7.3-65

Long-read Nanopore data, Gabbutt and Duran-Ferrer 2025

10/07/2025