Analyses related to long-read Nanopore data. We start the analyses with the counts provided by CNAG. This matrix needs filtering and curation.
##
## Load necessary packages and set global options
##
library(data.table)
library(janitor)
library(dplyr)
library(ggpubr)
library(pals)
library(GenomicRanges)
library(openxlsx)
library(BSgenome.Hsapiens.UCSC.hg38)
options(stringsAsFactors = F,max.print = 10000,error=NULL,
"openxlsx.dateFormat" = "dd/mm/yyyy"
)
## load fCpGs with meth values
fcpgs.betas <-
fread("../Data/FMC/Final_data/beta_filtered_samples_fcpgs_laplacian.tsv",data.table = F) %>%
tibble::column_to_rownames(var = "V1")
fcpgs.betas[1:5,1:5]
dim(fcpgs.betas)
fcpgs <-
openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
sheet = "supplementary_table_3") %>%
clean_names()
fcpgs <-
GRanges(fcpgs$chromosome,IRanges(fcpgs$position,fcpgs$position),strand = fcpgs$strand,
fcpgs %>% select(-c(chromosome,position,strand))
)
fcpgs
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,fcpgs[strand(fcpgs)=="-"]) %>% as.character() %>% table
##perform liftover
hg19.to.hg38.chain <- rtracklayer::import.chain(con = "../Revision/Data/Nanopore/hg19ToHg38.over.chain")
fcpgs.hg38 <- rtracklayer::liftOver(x = fcpgs,chain = hg19.to.hg38.chain)
fcpgs.hg38 <- unlist(fcpgs.hg38)
length(fcpgs)
length(fcpgs.hg38)
fcpgs$name[which(!fcpgs$name %in% fcpgs.hg38$name)]
fcpgs.hg38
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
fcpgs.hg38[strand(fcpgs.hg38)=="-"] %>% granges()
##liftover do not work on this CpG
fcpgs.hg38[strand(fcpgs.hg38)=="-"][which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G"),]
exclude <- fcpgs.hg38[strand(fcpgs.hg38)=="-"]$name[which(getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() !="G")]
fcpgs.hg38 <- fcpgs.hg38[fcpgs.hg38$name!=exclude]
fcpgs.hg38
#check nucleotides
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="+"]) %>% as.character() %>% table
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38[strand(fcpgs.hg38)=="-"]) %>% as.character() %>% table
# ok!
##load metadata
sheets <- getSheetNames("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx")
sheets <- setNames(sheets,sheets)
sheets
fcpgs.metadata <- parallel::mclapply(sheets,function(i){
openxlsx::read.xlsx("../Revision/Data/Supplementary_Tables_revision_curated.2.xlsx",
sheet = i,
detectDates = T)
},mc.cores = 9)
head(fcpgs.metadata$supplementary_table_1)
## load metadata of meth nanopore samples
nano.metadata <-
openxlsx::read.xlsx("../Revision/Data/Nanopore/Nanopore_CLL_Bcell_curated.xlsx") %>%
clean_names()
head(nano.metadata)
nano.metadata %>% select(sample_barcode,sample_name,sample_id_curated)
nano.metadata %>% View()
##load meth data
gc()
meth.cov <-
fread("../Revision/Data/Nanopore/beds/BCLLatlas_5mC_CpG_meth_cov.txt.gz") %>%
GRanges()
gc()
meth.cov
colnames(mcols(meth.cov)) <- gsub(":","_",colnames(mcols(meth.cov)))
old.colnames <- colnames(mcols(meth.cov))
old.colnames
nano.metadata %>% arrange(sample_id_curated) %>% View()
##change name normal B cells
colnames(mcols(meth.cov))[gsub("_meth|_cov","",colnames(mcols(meth.cov))) %in% nano.metadata$sample_barcode] <-
paste0(nano.metadata$sample_id_curated[match(gsub("_meth|_cov","",colnames(mcols(meth.cov))),nano.metadata$sample_barcode) %>% na.omit()],
c("_meth","_cov")
)
colnames(mcols(meth.cov))
##change name CLLs
colnames(mcols(meth.cov))[gsub("_meth|_cov","",colnames(mcols(meth.cov))) %in% nano.metadata$flowcell_name] <-
paste0(nano.metadata$sample_id_curated[match(gsub("_meth|_cov","",colnames(mcols(meth.cov))),nano.metadata$flowcell_name) %>% na.omit()],
c("_meth","_cov")
)
data.frame(old.colnames,new.colnames=colnames(mcols(meth.cov))) %>% arrange(new.colnames)
length(meth.cov)
## Order and subset chromomes
meth.cov <- sort(sortSeqlevels(meth.cov),ignore.strand=T)
meth.cov <- meth.cov[which(seqnames(meth.cov) %in% paste0("chr",1:22)),]
meth.cov %>% width() %>% table
gc()
##
## In combined strand files, the informatino of methylation for each strand is merged!
##
# check it out:!
meth.cov.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,meth.cov)
meth.cov.seq
identical(length(meth.cov.seq),length(meth.cov))
##2 and 3 nucleotide should always be CG! # 1st one is relatex to differnet reference systems in R
apply(as.matrix(meth.cov.seq),2,table) ##ok!!
## set coordinates accordingly to match array CG annotation
start(meth.cov) <- start(meth.cov)+1
meth.cov
meth.cov.seq <- getSeq(BSgenome.Hsapiens.UCSC.hg38,meth.cov)
meth.cov.seq
identical(length(meth.cov.seq),length(meth.cov))
## All should be CG!!!
apply(as.matrix(meth.cov.seq),2,unique) ##ok!!
##make some checks
width(meth.cov) %>% table() ##
meth.cov
##all good! manually check at UCSC browser!
We next check some QC metrics, such as sequencing depth, CpGs passing, etc.
##checkings
meth.cov %>%
mcols() %>%
data.frame() %>%
dplyr::select(n_pass) %>%
ggplot(aes(x=n_pass)) +
geom_histogram()+
ggtitle("Number of CpGs passing in across smaples")+
theme_bw()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
We next check the coverage of each sample. We can see there is 1 sample with significantly less coverage, and we subseqeuntly remove it.
df.cov <-
meth.cov %>%
mcols() %>%
data.frame() %>%
dplyr::select(contains("_cov")) %>%
reshape2::melt() %>%
group_by(variable) %>%
summarise(boxplot.stats=list(boxplot.stats(value)))
df.cov <- cbind(df.cov[,"variable"],
lapply(df.cov$boxplot.stats,function(x)x$stats) %>% do.call(rbind,.)
)
df.cov
df.cov %>%
ggplot(aes(x=variable,
y=`3`,
))+
geom_linerange(aes(ymin=`1`,
ymax=`5`),
lwd=0.3
)+
geom_crossbar(aes(ymin=`2`,
ymax=`4`),
lwd=0.3,
fill="grey80"
)+
theme_bw()+
ylab("Reads covering CpGs (outliers removed)")+
ggtitle("Read coverage")+
xlab(NULL)+
theme(text = element_text(colour = "grey0",size = 5),
axis.text.x = element_text(angle = 90,vjust = 1,hjust = 1),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
##
## Remove sample with low coverage
##
samples.low.coverage <- "012-06-01TD"
meth.cov <- meth.cov[,grep(samples.low.coverage,colnames(mcols(meth.cov)),invert = T,value = T)]
meth.cov$n_pass2 <-
rowSums(apply(meth.cov[,grep("_cov$",colnames(mcols(meth.cov)))] %>% mcols() %>% as.matrix,2,function(x)!is.na(x)))
gc()
table(meth.cov$n_pass2)
length(meth.cov)
meth.cov <- meth.cov[meth.cov$n_pass2!=0] ##remove low-coverage sample-specific regions
length(meth.cov)
gc()
We next plot the genome-wide methylation of the cleaned DNA methylation matrix, with those CpGs covered by at least 10 reads.
N.min.reads <- 10 ##
##number of CpGs with more than 10 reads
##genome-wide meth
p <- meth.cov %>%
mcols() %>%
as.data.frame() %>%
##10 reads covering a CpG in all the samples
filter(n_pass2==ncol(.[,grep("_cov$",colnames(.))])) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.))],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.))])) %>%
select(contains("meth"))
N.CpGs <- nrow(p)
p <- p %>%
reshape2::melt() %>%
ggplot(aes(x=value))+
facet_wrap(variable~.,scales = "free_y")+
geom_histogram()+
theme_classic()+
ggtitle(paste0(N.min.reads," reads in all samples. CpGs=",N.CpGs))+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text.y = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
gc()
We next check the methylation of fCpGs
## meth fcpgs
# update annotation for - stand, as array annot of CG always refer to + strand and first nucleotide
fcpgs.hg38[which(strand(fcpgs.hg38)=="-")] <-
resize(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")],width = 2,fix = "end")
fcpgs.hg38
start(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")]) <-
end(fcpgs.hg38[which(strand(fcpgs.hg38)=="-")])
fcpgs.hg38 %>% width() %>% table
fcpgs.hg38
##check nucleotide seq, always should be a C!
getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.hg38) %>% as.character() %>% table
##get overlaps
##sort before overlapping
gc()
fcpgs.hg38 <- sort(sortSeqlevels(fcpgs.hg38),ignore.strand=TRUE)
meth.cov <- sort(sortSeqlevels(meth.cov),ignore.strand=TRUE)
fcpgs.overlaps <- findOverlaps(query = fcpgs.hg38,
subject = meth.cov,
ignore.strand=T
)
fcpgs.overlaps ## all present!
subjectHits(fcpgs.overlaps) %>% length()
subjectHits(fcpgs.overlaps) %>% unique() %>% length()
fcpgs.meth <- meth.cov[subjectHits(fcpgs.overlaps)]
fcpgs.meth %>% width() %>% table
fcpgs.meth
##names regions with cpg names
queryHits(fcpgs.overlaps) %>% length()
queryHits(fcpgs.overlaps) %>% unique() %>% length()
names(fcpgs.meth) <- fcpgs.hg38$name[queryHits(fcpgs.overlaps)]
identical(fcpgs.meth,sort(sortSeqlevels(fcpgs.meth),ignore.strand=TRUE))
fcpgs.meth
getSeq(BSgenome.Hsapiens.UCSC.hg38,fcpgs.meth) %>% as.character() %>% table()
N.min.reads <- 10
p <- fcpgs.meth %>%
mcols() %>%
as.data.frame() %>%
##10 reads covering a CpG in all the samples
filter(n_pass2==ncol(.[,grep("_cov$",colnames(.))])) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.))],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.))])) %>%
select(contains("meth"))
N.CpGs <- nrow(p)
N.CpGs
p %>%
reshape2::melt() %>%
ggplot(aes(x=value))+
facet_wrap(variable~.,scales = "free_y")+
geom_histogram()+
xlab("DNA methylation")+
ggtitle(paste0(N.min.reads," reads as minimum coversge. CpGs=",N.CpGs))+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text.y = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
We next compare the fCpG methylation values measured by Nanopore and arrays of matched samples, and see great agreement.
#pairwise comparisons between same sample. illumina and nanopore
array.rep.samples <-
fcpgs.betas %>%
select( (fcpgs.metadata$supplementary_table_1 %>%
filter(sample_id_new=="019-15-01TD" | sample_id_new=="019-0047-01TD" | sample_id_new=="012-19-01TD") %>%
pull(participant_id_anonymous)
)
) %>%
tibble::rownames_to_column("CpG") %>%
reshape2::melt() %>%
mutate(sample_id=fcpgs.metadata$supplementary_table_1$sample_id_new[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)],
timepoint=fcpgs.metadata$supplementary_table_1$sample_timepoints[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)],
platform = fcpgs.metadata$supplementary_table_1$platform[match(variable,fcpgs.metadata$supplementary_table_1$participant_id_anonymous)]
) %>%
rename(variable="participant_id_anonymous",
value="array.meth")
head(array.rep.samples);dim(array.rep.samples)
array.rep.samples %>% count(sample_id,participant_id_anonymous)
##nanopore comparison with all paired array-nanopore samples (ie facets)
for(i in c(1,5,10,15,20,25,30)){
N.min.reads <- i
nano.rep.samples <-
fcpgs.meth %>%
mcols() %>%
data.frame() %>%
select(starts_with(c("X019.0047.01TD","X019.15.01TD","X012.19.01TD"))) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(all_of(c("X019.0047.01TD_meth","X019.15.01TD_meth","X012.19.01TD_meth"))) %>%
tibble::rownames_to_column("CpG") %>%
rename(all_of(setNames(c("019-15-01TD","019-0047-01TD","012-19-01TD"),
c("X019.15.01TD_meth","X019.0047.01TD_meth","X012.19.01TD_meth")
)
)
) %>%
reshape2::melt() %>%
rename(variable="sample_id",
value="nanopore.meth"
)
head(nano.rep.samples);dim(nano.rep.samples)
nano.rep.samples %>% count(sample_id)
meth.rep.samples <-
full_join(x = array.rep.samples,
y=nano.rep.samples,
by = join_by(CpG==CpG,sample_id==sample_id,),
keep = T
)
meth.rep.samples %>% head
#samples are sorted by samples and CpG
meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth)
meth.rep.samples <-
meth.rep.samples %>%
tidyr::drop_na()
N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
p <-
meth.rep.samples %>%
mutate(facet=paste0(sample_id.x,"_",platform)) %>%
ggplot(aes(x = nanopore.meth, ##nanopre data in X
y= array.meth, ##array data on Y
color=cpgs_diff
))+
geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
stat_density2d(aes(alpha=after_stat(level)),
lwd=0.1,
color="grey0"
)+
geom_point(pch=19,stroke=0,size=0.8)+
geom_smooth(aes(group=facet),
lwd=0.2,
method = "lm",
color="grey0"
)+
stat_cor(size=1)+
ggtitle(paste0("At least ",N.min.reads," reads for all Cs in all samples, CpGs=",N.CpGs))+
scale_color_gradientn("Methylation\ndifference",
colours = viridis(100) %>% rev(),
limits=c(0,0.25),
na.value = viridis(100)[1]
)+
facet_grid(~facet)+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
We next use the sample with the highest overage.
for(i in c(1,10,20,30,40,45,50,55,60,65)){
N.min.reads <- i
nano.rep.samples <-
fcpgs.meth %>%
mcols() %>%
data.frame() %>%
select(starts_with(c("X019.15.01TD"))) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(all_of(c("X019.15.01TD_meth"))) %>%
tibble::rownames_to_column("CpG") %>%
rename(all_of(setNames(c("019-15-01TD"),
c("X019.15.01TD_meth")
)
)
) %>%
reshape2::melt() %>%
rename(variable="sample_id",
value="nanopore.meth"
)
head(nano.rep.samples);dim(nano.rep.samples)
nano.rep.samples %>% count(sample_id)
meth.rep.samples <-
full_join(x = array.rep.samples,
y=nano.rep.samples,
by = join_by(CpG==CpG,sample_id==sample_id,),
keep = T
)
meth.rep.samples %>% head
#samples are sorted by samples and CpG
meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth)
meth.rep.samples <-
meth.rep.samples %>%
tidyr::drop_na()
N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
p <-
meth.rep.samples %>%
mutate(facet=paste0(sample_id.x,"_",platform)) %>%
ggplot(aes(x = nanopore.meth, ##nanopre data in X
y= array.meth, ##array data on Y
color=cpgs_diff
))+
geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
stat_density2d(aes(alpha=after_stat(level)),
lwd=0.1,
color="grey0"
)+
geom_point(pch=19,stroke=0,size=0.8)+
geom_smooth(aes(group=facet),
lwd=0.2,
method = "lm",
color="grey0"
)+
stat_cor(size=1)+
ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
scale_color_gradientn("Methylation\ndifference",
colours = viridis(100) %>% rev(),
limits=c(0,0.25),
na.value = viridis(100)[1]
)+
facet_grid(~facet)+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
We compare the distribution of methylation values of fCpGs measured by nanopore with arrays with increasing coverage. The distribution is very similar!
for(i in c(1,10,20,30,40,45,50,55,60,65)){
N.min.reads <- i
nano.rep.samples <-
fcpgs.meth %>%
mcols() %>%
data.frame() %>%
select(starts_with(c("X019.15.01TD"))) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(all_of(c("X019.15.01TD_meth"))) %>%
tibble::rownames_to_column("CpG") %>%
rename(all_of(setNames(c("019-15-01TD"),
c("X019.15.01TD_meth")
)
)
) %>%
reshape2::melt() %>%
rename(variable="sample_id",
value="nanopore.meth"
)
head(nano.rep.samples);dim(nano.rep.samples)
nano.rep.samples %>% count(sample_id)
meth.rep.samples <-
full_join(x = array.rep.samples,
y=nano.rep.samples,
by = join_by(CpG==CpG,sample_id==sample_id,),
keep = T
)
meth.rep.samples <-
meth.rep.samples %>%
tidyr::drop_na()
N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
##hisogram
p <-
meth.rep.samples %>%
reshape2::melt(id.vars=c("sample_id.x","CpG.x"),measure.vars=c("array.meth","nanopore.meth")) %>%
ggplot(aes(x = value, ##array data on Y
fill=variable
))+
geom_histogram(color="grey20",lwd=0.2)+
scale_fill_brewer(palette = "Set2")+
facet_grid(variable~.)+
xlab("DNA methylation")+
ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 0,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
for(i in c(1,10,20,30,40,45,50,55,60,65)){
N.min.reads <- i
nano.rep.samples <-
fcpgs.meth %>%
mcols() %>%
data.frame() %>%
select(starts_with(c("X012.19.01TD"))) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(all_of(c("X012.19.01TD_meth"))) %>%
tibble::rownames_to_column("CpG") %>%
rename(all_of(setNames(c("012-19-01TD"),
c("X012.19.01TD_meth")
)
)
) %>%
reshape2::melt() %>%
rename(variable="sample_id",
value="nanopore.meth"
)
head(nano.rep.samples);dim(nano.rep.samples)
nano.rep.samples %>% count(sample_id)
meth.rep.samples <-
full_join(x = array.rep.samples,
y=nano.rep.samples,
by = join_by(CpG==CpG,sample_id==sample_id,),
keep = T
)
meth.rep.samples %>% head
#samples are sorted by samples and CpG
meth.rep.samples$cpgs_diff <- abs(meth.rep.samples$array.meth - meth.rep.samples$nanopore.meth)
meth.rep.samples <-
meth.rep.samples %>%
tidyr::drop_na()
N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
p <-
meth.rep.samples %>%
mutate(facet=paste0(sample_id.x,"_",platform)) %>%
ggplot(aes(x = nanopore.meth, ##nanopre data in X
y= array.meth, ##array data on Y
color=cpgs_diff
))+
geom_abline(slope = 1,color="grey40",linetype="dashed",lwd=0.2) +
stat_density2d(aes(alpha=after_stat(level)),
lwd=0.1,
color="grey0"
)+
geom_point(pch=19,stroke=0,size=0.8)+
geom_smooth(aes(group=facet),
lwd=0.2,
method = "lm",
color="grey0"
)+
stat_cor(size=1)+
ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
scale_color_gradientn("Methylation\ndifference",
colours = viridis(100) %>% rev(),
limits=c(0,0.25),
na.value = viridis(100)[1]
)+
facet_grid(~facet)+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 90,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
for(i in c(1,10,20,30,40,45,50,55,60,65)){
N.min.reads <- i
nano.rep.samples <-
fcpgs.meth %>%
mcols() %>%
data.frame() %>%
select(starts_with(c("X012.19.01TD"))) %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(all_of(c("X012.19.01TD_meth"))) %>%
tibble::rownames_to_column("CpG") %>%
rename(all_of(setNames(c("012-19-01TD"),
c("X012.19.01TD_meth")
)
)
) %>%
reshape2::melt() %>%
rename(variable="sample_id",
value="nanopore.meth"
)
head(nano.rep.samples);dim(nano.rep.samples)
nano.rep.samples %>% count(sample_id)
meth.rep.samples <-
full_join(x = array.rep.samples,
y=nano.rep.samples,
by = join_by(CpG==CpG,sample_id==sample_id,),
keep = T
)
meth.rep.samples <-
meth.rep.samples %>%
tidyr::drop_na()
N.CpGs <- meth.rep.samples$CpG.x %>% unique() %>% length()
##hisogram
p <-
meth.rep.samples %>%
reshape2::melt(id.vars=c("sample_id.x","CpG.x"),measure.vars=c("array.meth","nanopore.meth")) %>%
ggplot(aes(x = value, ##array data on Y
fill=variable
))+
geom_histogram(color="grey20",lwd=0.2)+
scale_fill_brewer(palette = "Set2")+
facet_grid(variable~.)+
xlab("DNA methylation")+
ggtitle(paste0(N.min.reads," reads, CpGs=",N.CpGs))+
theme_classic()+
theme(text = element_text(colour = "grey0",size = 5),
panel.grid.major = element_line(linewidth = 0.075),
# panel.grid.minor.x = element_line(linewidth = 0.025),
strip.placement = "out",
strip.background = element_blank(),
strip.text = element_text(angle = 0),
plot.margin = unit(c(0,0,0,0),"pt"),
line=element_line(linewidth = 0.1),
legend.position = "bottom",
legend.box.margin=margin(-5,-5,-5,-5),
# legend.justification="top",
legend.text = element_text(angle = 0,vjust = 0.5,hjust = 0.5),
plot.background =element_blank(),
legend.key.size = unit(5,"pt")
)
print(p)
}
We finally plot the heatmap with Nanopore data for all samples
##heamtap
library(ComplexHeatmap)
library(circlize)
ht_opt$legend_grid_width <- unit(2,"mm")
ht_opt$legend_grid_height <-unit(2,"mm")
ht_opt$legend_labels_gp <- gpar(fontsize=5)
ht_opt$legend_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_title_gp <- gpar(fontsize=5)
ht_opt$heatmap_row_names_gp <- gpar(fontsize=5)
ht_opt$heatmap_column_names_gp <- gpar(fontsize=5)
ht_opt$HEATMAP_LEGEND_PADDING <- unit(1,"mm")
ht_opt$ANNOTATION_LEGEND_PADDING <- unit(1,"mm")
ht_opt$legend_gap <- unit(c(1,1),"mm")
ht_opt$fast_hclust <- T
N.min.reads<-10
mat.meth <-
fcpgs.meth %>%
mcols() %>% data.frame() %>%
filter(rowSums(apply(.[,grep("_cov$",colnames(.)),drop=F],2,function(x){x>=N.min.reads}))==
ncol(.[,grep("_cov$",colnames(.)),drop=F])) %>%
select(contains("_meth"))
head(mat.meth);dim(mat.meth)
fcpgs.metadata$supplementary_table_1 %>%
filter(participant_id_new=="SCLL-019") %>%
select(sample_id_new,diagnosis_clinical)
sample.groups<- colnames(mat.meth)
sample.groups
sample.groups[grep("NBC|MBC",colnames(mat.meth))] <- "Bcell"
sample.groups[grep("19.15|12.19",colnames(mat.meth))] <- "RT"
sample.groups[grep("19.15|12.19|NBC|MBC",colnames(mat.meth),invert = T)] <- "CLL"
sample.groups
##load color palette used for paper
evoflux.meth <- fread("../Revision/Data/meth_palette_evoflux.tsv")
evoflux.meth
h <- Heatmap(matrix = mat.meth,
col=evoflux.meth$color,
row_title=paste0("N=",nrow(mat.meth)," (min reads ",N.min.reads,")"),
cluster_columns = T,
cluster_rows = T,
show_row_names = F,
show_row_dend = F,
show_column_dend = F,
name="DNA methylation",
border = T,
column_split = ifelse(sample.groups=="Bcell","Normal B cell", "B-cell tumor"),
cluster_column_slices = F,
row_dend_gp = gpar(lwd=0.01),
column_dend_gp = gpar(lwd=0.3),
na_col = "grey0",
top_annotation = HeatmapAnnotation("Cell type"=sample.groups,
annotation_legend_param = list(direction="horizontal",
nrow=1),
col=list("Cell type"=c("Bcell"="grey90",
"CLL"="#009E73",
"RT"="#02C599"
)
),
show_annotation_name = F,
na_col = "grey45",
annotation_name_side = "left",
show_legend = F,
annotation_name_gp = gpar(fontsize=5),
simple_anno_size = unit(2,"mm")
),
heatmap_legend_param = list(break_dist = 1,at=seq(0,1,by=0.2),direction="horizontal"),
)
draw(h,
ht_gap = unit(2,"mm"),
merge_legend=T,
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom"
)
print(sessionInfo())
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Madrid
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] circlize_0.4.16 ComplexHeatmap_2.24.1
## [3] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.76.0
## [5] rtracklayer_1.68.0 BiocIO_1.18.0
## [7] Biostrings_2.76.0 XVector_0.48.0
## [9] openxlsx_4.2.8 GenomicRanges_1.60.0
## [11] GenomeInfoDb_1.44.2 IRanges_2.42.0
## [13] S4Vectors_0.46.0 BiocGenerics_0.54.0
## [15] generics_0.1.4 pals_1.10
## [17] ggpubr_0.6.1 ggplot2_3.5.2
## [19] dplyr_1.1.4 janitor_2.2.1
## [21] data.table_1.17.8 BiocStyle_2.36.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 rlang_1.1.6
## [3] magrittr_2.0.3 clue_0.3-66
## [5] GetoptLong_1.0.5 snakecase_0.11.1
## [7] matrixStats_1.5.0 compiler_4.5.0
## [9] mgcv_1.9-3 png_0.1-8
## [11] reshape2_1.4.4 vctrs_0.6.5
## [13] maps_3.4.3 stringr_1.5.1
## [15] shape_1.4.6.1 pkgconfig_2.0.3
## [17] crayon_1.5.3 fastmap_1.2.0
## [19] magick_2.8.7 backports_1.5.0
## [21] labeling_0.4.3 Rsamtools_2.24.0
## [23] rmarkdown_2.29 UCSC.utils_1.4.0
## [25] tinytex_0.57 purrr_1.1.0
## [27] xfun_0.53 cachem_1.1.0
## [29] jsonlite_2.0.0 DelayedArray_0.34.1
## [31] BiocParallel_1.42.1 cluster_2.1.8.1
## [33] broom_1.0.9 parallel_4.5.0
## [35] R6_2.6.1 bslib_0.9.0
## [37] stringi_1.8.7 RColorBrewer_1.1-3
## [39] car_3.1-3 lubridate_1.9.4
## [41] jquerylib_0.1.4 iterators_1.0.14
## [43] Rcpp_1.1.0 bookdown_0.44
## [45] SummarizedExperiment_1.38.1 knitr_1.50
## [47] R.utils_2.13.0 splines_4.5.0
## [49] Matrix_1.7-4 timechange_0.3.0
## [51] tidyselect_1.2.1 rstudioapi_0.17.1
## [53] dichromat_2.0-0.1 abind_1.4-8
## [55] yaml_2.3.10 doParallel_1.0.17
## [57] codetools_0.2-20 curl_7.0.0
## [59] plyr_1.8.9 lattice_0.22-7
## [61] tibble_3.3.0 Biobase_2.68.0
## [63] withr_3.0.2 evaluate_1.0.5
## [65] isoband_0.2.7 zip_2.3.3
## [67] pillar_1.11.0 BiocManager_1.30.26
## [69] MatrixGenerics_1.20.0 carData_3.0-5
## [71] foreach_1.5.2 RCurl_1.98-1.17
## [73] scales_1.4.0 glue_1.8.0
## [75] mapproj_1.2.12 tools_4.5.0
## [77] ggsignif_0.6.4 GenomicAlignments_1.44.0
## [79] fastcluster_1.3.0 XML_3.99-0.19
## [81] Cairo_1.6-5 tidyr_1.3.1
## [83] colorspace_2.1-1 nlme_3.1-168
## [85] GenomeInfoDbData_1.2.14 restfulr_0.0.16
## [87] Formula_1.2-5 cli_3.6.5
## [89] S4Arrays_1.8.1 gtable_0.3.6
## [91] R.methodsS3_1.8.2 rstatix_0.7.2.999
## [93] BSgenome.Hsapiens.UCSC.hg19_1.4.3 sass_0.4.10
## [95] digest_0.6.37 SparseArray_1.8.1
## [97] rjson_0.2.23 farver_2.1.2
## [99] htmltools_0.5.8.1 R.oo_1.27.1
## [101] lifecycle_1.0.4 httr_1.4.7
## [103] GlobalOptions_0.1.2 MASS_7.3-65