Due to the long execution time, we provide only the code here.

1 Code

##
## Run QC in batches, one per generated sample sheet
##
options(stringsAsFactors = F,error=NULL)

sample.sheets <- list.files(path = "../../Data/Processings/2_create_QC_samplesheets/",pattern = ".tsv$")
batches <- gsub(".tsv$","",sample.sheets)
batches

for (batch in batches){
  
  writeLines(batch)
  
  # User-defined parameters, set per batch
  project <- paste0("/Users/martiduranferrer/Desktop/Marti_SSD/WORK/fCpGs/Data/QC_",batch)
  
  if(grepl("450k",batch)){
    #450k
    samp.sheet.path.450k <- paste0("/Users/martiduranferrer/Desktop/Marti_SSD/WORK/fCpGs/Data/Processings/2_create_QC_samplesheets/",batch,".tsv")
    baseDir.450k <- "/Users/martiduranferrer/Desktop/Marti_SSD/WORK/IDATS/IDATS_450k/"
    samp.sheet.path.EPIC <- NULL
    baseDir.EPIC <- NULL
  }
  
  if(grepl("EPIC",batch)){
    #EPIC
    samp.sheet.path.EPIC <- paste0("/Users/martiduranferrer/Desktop/Marti_SSD/WORK/fCpGs/Data/Processings/2_create_QC_samplesheets/",batch,".tsv")
    baseDir.EPIC <- "/Users/martiduranferrer/Desktop/Marti_SSD/WORK/IDATS/IDATS_850k/"
    samp.sheet.path.450k <- NULL
    baseDir.450k <- NULL
  }
  
  # controls and calculations
  excludingCpGs <- NULL # as.character(read.table("path")[,1]) ## Predefined set of CpGs to remove
  
  ##Names of samples and sample group analysis
  Sample_Name_Analysis_column <- "SAMPLE_NAME_ANALYSIS"
  Sample_Group_Analysis_column <- "SAMPLE_GROUP_ANALYSIS"
  
  ##qc
  plot.minfi.qcReport <- TRUE
  
  # NORMALIZATION
  ArrayOutType <- NULL
  Detection.Pval.Mean.Sample.cutoff <- 0.01
  Detection.Pval.CpGs.cutoff <- 1e-16
  Detect.Pval.pct.samples <- 90 ## options: 50, 60, 70, 80, 90, 100, from less to more restrictive.
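  ## Detection.Pval.Mean.Sample.cutoff: per-sample cutoff on the mean detection p-value across probes.
  ## Detection.Pval.CpGs.cutoff: per-probe detection p-value cutoff.
  ## Detect.Pval.pct.samples: percentage of good-quality samples in which a probe must pass the cutoff to be kept.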
  RemoveSNPs <- TRUE
  MAF <- 0
  RemoveSexualChrs <- TRUE
  Normalization.type <- "ssNob" # or "SWAN"; two recommended, batch-independent normalizations. Check the minfi package for further details.
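  ## ssNob (single-sample Noob) performs within-array background correction and dye-bias adjustment, and
  ## SWAN is a within-array probe-type normalization, so normalized values for a sample do not depend on
  ## which other samples are processed in the same batch.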
  
  # Savings 
  saveRGset <- T
  saveRGset.norm <- T
  
  
  
  
  ###########################################################
  ## Start analysis and load required libraries
  ###########################################################
  
  writeLines("Starting the analysis")
  writeLines(paste0(rep("_",100),collapse = ""))
  
  writeLines("Loading requires libraries")
  writeLines(paste0(rep("_",100),collapse = ""))
  suppressMessages(library(minfi))
  suppressMessages(library(ggplot2))
  suppressMessages(library(ggrepel))
  suppressMessages(library(data.table))
  suppressMessages(library(irlba))
  suppressMessages(library(qs))
  
  
  ###########################################################
  ## Define dirs and necessary variables to start analysis
  ###########################################################
  
  ## derive the project name and its parent directory from the project path
  project.name <- basename(project)
  dir.project <- dirname(project)
  
  ###########################################################
  ## Reading IDAT files and pheno data
  ###########################################################
  
  writeLines("Reading IDAT files")
  writeLines(paste0(rep("_",100),collapse = ""))
  setwd(dir.project)
  dir.create(project)
  setwd(project)
  
  
  
  read_idats_to_RGSet <- function(IDATS.path = NULL,Sample.sheet.path = NULL){
    stopifnot(!is.null(IDATS.path) && !is.null(Sample.sheet.path))
    baseDir <- IDATS.path
    samp.sheet.path <- Sample.sheet.path
    
    # checking
    if(!dir.exists(dir.project)){
      stop(paste0("The following directory does not exist!:\n",dir.project))
    }
    if(!file.exists(samp.sheet.path)){
      stop(paste0("File does not exist!:\n",samp.sheet.path))
    }
    if(!dir.exists(baseDir)){
      stop(paste0("Directory of IDAT files does not exist:\n",baseDir))
    }
    
    # v.3.1
    # 
    # if(file.exists(file.path(baseDir,samp.sheet))){
    #   invisible(file.remove(file.path(baseDir,samp.sheet)))
    # }
    # FileCopied <- file.copy(from = samp.sheet.path,to = baseDir)
    # if(!FileCopied){
    #   stop(paste0("Could not start analysis. Cannot arrange necessary directories!"))
    # }
    
    # targets <- read.metharray.sheet(base = baseDir,pattern = samp.sheet,ignore.case = F,verbose = T)
    # stopifnot(all(!is.na(targets$Basename)))
    
    ##changed in v.3.2
    ##get the targets data.frame with more freedom and without the need to copy it into the IDAT directory.
    targets <- fread(input = Sample.sheet.path,data.table = F)
    ##infer slide and array columns
    Slide.column <- grep("Slide|Sentrix_ID",colnames(targets),ignore.case = T,value = T)
    Array.column <- grep("Array|Sentrix_Position",colnames(targets),ignore.case = T,value = T)
    
    if(any(c(length(Slide.column)==0,length(Array.column)==0))){
      stop("The sample sheet must contain a Slide/Sentrix_ID column and an Array/Sentrix_Position column")
    }
    
    if(any(c(length(Slide.column)>1,length(Array.column)>1))){
      stop("The sample sheet must contain only one Slide/Sentrix_ID column and one Array/Sentrix_Position column")
    }
    
    targets$Basename <- paste0(IDATS.path,"/",targets[[Slide.column]],"/",targets[[Slide.column]],"_",targets[[Array.column]])
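    ## read.metharray.exp() appends "_Grn.idat"/"_Red.idat" to each Basename, which here points to
    ## <IDATS.path>/<Slide>/<Slide>_<Array>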
    
    
    # print(head(targets,20))
    # print(tail(targets,20))
    ##little check of Sample sheet
    
    if(!all(c(Sample_Name_Analysis_column,Sample_Group_Analysis_column)%in%colnames(targets))){
      stop("The sample sheet must contain the columns named in 'Sample_Name_Analysis_column' and 'Sample_Group_Analysis_column'")
    }
    
    ##sample names for analyses must be unique!
    stopifnot(!any(duplicated(targets[,Sample_Name_Analysis_column])))
    
    RGset <- read.metharray.exp(targets = targets,verbose = T,force = T)
    rownames(pData(RGset)) <- as.character(pData(RGset)[,Sample_Name_Analysis_column])
    invisible(gc())
    return(RGset)
  }
  
  ## Read IDATS from 450k arrays
  if(!is.null(baseDir.450k) && !is.null(samp.sheet.path.450k)){
    RGset450k <- read_idats_to_RGSet(IDATS.path = baseDir.450k,Sample.sheet.path = samp.sheet.path.450k)
    writeLines("450k data loaded!")
  }
  invisible(gc())
  
  ## Read IDATS from EPIC arrays
  if(!is.null(baseDir.EPIC) && !is.null(samp.sheet.path.EPIC)){
    RGsetEPIC <- read_idats_to_RGSet(IDATS.path = baseDir.EPIC,Sample.sheet.path = samp.sheet.path.EPIC)
    writeLines("EPIC data loaded!")
  }
  invisible(gc())
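  ## If both 450k and EPIC data are present, combineArrays() reduces them to the probes shared by both
  ## platforms (with ArrayOutType = NULL, the function falls back to its default output array type).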
  
  if(exists("RGset450k") && exists("RGsetEPIC")){
    RGset <- combineArrays(object1 = RGset450k,object2 = RGsetEPIC,outType = ArrayOutType,verbose=T)
    rm(RGset450k)
    rm(RGsetEPIC)
  }else if(exists("RGset450k")){
    RGset <- RGset450k
    rm(RGset450k)
  }else{
    RGset <- RGsetEPIC
    rm(RGsetEPIC)
  }
  invisible(gc())
  
  
  ##save RGSet
  dir.create("RData")
  
  if(saveRGset){
    writeLines("Saving RGSet object")
    qs::qsave(RGset,file=paste0("RData/",project.name,"_RGset.qs"))
  }
  
  ###########################################################
  ## Quality Control
  ###########################################################
  
  
  
  writeLines("Performing quality control of samples")
  writeLines(paste0(rep("_",100),collapse = ""))
  
  invisible(gc())
  dir.create("QC")
  
  ##general QC report
  if(plot.minfi.qcReport){
    writeLines("minfi qcReport")
    qcReport(rgSet = RGset,pdf = "QC/QC_RGset_minfi.qc.Report.pdf",
             sampNames = pData(RGset)[,Sample_Name_Analysis_column],
             sampGroups = factor(pData(RGset)[,Sample_Group_Analysis_column],levels = unique(pData(RGset)[,Sample_Group_Analysis_column])),
             maxSamplesPerPage = ifelse(nrow(pData(RGset))>100,100,nrow(pData(RGset)))
    )
    invisible(gc())
  }
  
  
  
  
  ###########################################################
  ## Start normalization
  ###########################################################
  
  invisible(gc())
  
  writeLines("Normalization")
  writeLines(paste0(rep("_",100),collapse = ""))
  
  if(Normalization.type=="Functional"){
    set.seed(6)
    warning("Using 'Functional Normalization' to normalize data. Between array normalization, beta values will change upon batches! Returning an grSet.")
    RGset.norm <- preprocessFunnorm(rgSet = RGset,verbose = T)
  }else if(Normalization.type=="Quantile"){
    warning("Using 'Quantile normalization' to normalize data. Between array normalization, beta values will change upon batches! Returning an grSet.")
    RGset.norm <- preprocessQuantile(object = RGset,verbose = T)
  }else if(Normalization.type=="SWAN"){
    set.seed(6)
    writeLines("Using 'SWAN normalization' to normalize data")
    RGset.norm <- preprocessSWAN(rgSet = RGset,verbose = T)
    invisible(gc())
  }else if(Normalization.type=="ssNob"){
    set.seed(6)
    writeLines("Using 'ssNob normalization' to normalize data")
    RGset.norm <- preprocessNoob(rgSet = RGset,verbose = T)
    invisible(gc())
  }else{
    writeLines("Using 'Illumina normalization' to normalize data")
    RGset.norm <- preprocessIllumina(object = RGset)
    invisible(gc())
  }
  invisible(gc())
  
  ## Save the normalized object without any further filtering
  if(saveRGset.norm){
    writeLines("Saving normalized RGSet object")
    qs::qsave(RGset.norm,file=paste0("RData/",project.name,"_RGset.normalized.",Normalization.type,".qs"))
  }
  
  ##
  ## Transform normalized object to Genomic intervals
  ##
  
  if(class(RGset.norm)=="MethylSet"){
    RGset.norm <- mapToGenome(RGset.norm)
    invisible(gc())
  }
  
  
  ##
  ## perform minfi QC, including signal intensity and Sex predictions after normalization
  ##
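  ## minfiQC() adds per-sample median log2 methylated/unmethylated (and X/Y chromosome) intensities
  ## plus a predicted sex to pData; these columns are renamed and merged further below.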
  
  writeLines("Performing qc on raw signal intensitites")
  RGset.qc <- minfiQC(object = preprocessRaw(RGset),fixOutliers = T,verbose = T)$object
  writeLines("Performing qc on normalized signal intensitites")
  RGset.norm <- minfiQC(object = RGset.norm,fixOutliers = T,verbose = T)$object
  invisible(gc())
  
  pdf("QC/QC_minfiQC.pdf",height = 5,width = 6,useDingbats = F)
  
  ##
  ## Sex predictions
  ##
  
  plotSex(object = RGset.qc)
  title(main="Sex prediction with Raw signal intensities")
  
  plotSex(object = RGset.norm)
  title(main=paste0("Sex prediction with ",Normalization.type," normalized signal intensities"))
  
  plotSex(object = RGset.qc,id = pData(RGset.qc)[,Sample_Name_Analysis_column])
  title(main="Sex prediction with Raw signal intensities with sample names")
  
  plotSex(object = RGset.norm,id = pData(RGset.norm)[,Sample_Name_Analysis_column])
  title(main=paste0("Sex prediction with ",Normalization.type," normalized signal intensities"))
  
  ##
  ## Signal intensities
  ##
  
  plotQC(qc = RGset.qc)
  title(main="Raw signal intensities")
  
  plotQC(qc = RGset.norm)
  title(main=paste0(Normalization.type," normalized signal intensities"))
  
  ## QC raw data
  ##change qc names
  colnames(pData(RGset.qc))[grep("^mMed$|^uMed$|^xMed$|^yMed$",colnames(pData(RGset.qc)))] <-
    paste0("QC_raw_",grep("^mMed$|^uMed$|^xMed$|^yMed$",colnames(pData(RGset.qc)),value = T))
  colnames(pData(RGset.qc))[which(colnames(pData(RGset.qc))=="predictedSex")] <- "QC_raw_Sex"
  pData(RGset.qc)$QC_good_signal_intensity_raw_minfi <- 
    ifelse(((pData(RGset.qc)$QC_raw_mMed + pData(RGset.qc)$QC_raw_uMed)/2)>=10.5,TRUE,FALSE)
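  ## 10.5 is minfi's default badSampleCutoff: samples whose mean of the median log2 methylated and
  ## unmethylated intensities falls below it are flagged as low quality (dashed line in the plots below).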
  
  ## QC normalized data
  colnames(pData(RGset.norm))[grep("^mMed$|^uMed$|^xMed$|^yMed$",colnames(pData(RGset.norm)))] <-
    paste0("QC_normalized_",grep("^mMed$|^uMed$|^xMed$|^yMed$",colnames(pData(RGset.norm)),value = T))
  colnames(pData(RGset.norm))[which(colnames(pData(RGset.norm))=="predictedSex")] <- "QC_normalized_Sex"
  pData(RGset.norm)$QC_good_signal_intensity_normalized_minfi <- 
    ifelse(((pData(RGset.norm)$QC_normalized_mMed + pData(RGset.norm)$QC_normalized_uMed)/2)>=10.5,TRUE,FALSE)
  
  ## Merge Raw and normalized QC into the normalized RGset
  pData(RGset.norm) <- cbind(pData(RGset.norm)[,grep("^QC_",colnames(pData(RGset.norm)),invert = T)],
                             pData(RGset.qc)[,grep("^QC_",colnames(pData(RGset.qc)))],
                             pData(RGset.norm)[,grep("^QC_",colnames(pData(RGset.norm)))]
  )
  
  ## plot minfiQC
  qc.raw.ggplot <- ggplot(data.frame(pData(RGset.qc)),
                          aes(x = QC_raw_mMed,
                              y= QC_raw_uMed,
                              color=factor(QC_good_signal_intensity_raw_minfi,levels = c("FALSE","TRUE"),labels = c("Bad","Good"))
                          )
  )+
    geom_point(pch=1,size=2)+
    geom_text_repel(aes(label=ifelse(!pData(RGset.qc)$QC_good_signal_intensity_raw_minfi,rownames(pData(RGset.qc)),NA)),
                    size=1,max.overlaps = Inf,min.segment.length = 0,segment.size=0.2
    )+
    guides(color=guide_legend(title = "Signal intensity"))+
    xlab("Meth median raw intensity (log2)")+
    ylab("Unmeth median raw intensity (log2)")+
    ggtitle(label = "Raw signal intensities",
            subtitle = paste0("Good=",length(which(pData(RGset.qc)$QC_good_signal_intensity_raw_minfi)),
                              ", Bad=",length(which(!pData(RGset.qc)$QC_good_signal_intensity_raw_minfi))
            )
    )+
    coord_cartesian(xlim = c(8,14),ylim = c(8,14))+
    geom_abline(intercept = 10.5*2,slope = -1,lty=2)+
    theme_bw()
  print(qc.raw.ggplot)
  
  
  
  ## plot minfiQC
  qc.norm.ggplot <- ggplot(data.frame(pData(RGset.norm)),
                           aes(x = QC_normalized_mMed,
                               y= QC_normalized_uMed,
                               color=factor(QC_good_signal_intensity_normalized_minfi,levels = c("FALSE","TRUE"),labels = c("Bad","Good"))
                           )
  )+
    geom_point(pch=1,size=2)+
    geom_text_repel(aes(label=ifelse(!pData(RGset.norm)$QC_good_signal_intensity_normalized_minfi,rownames(pData(RGset.norm)),NA)),
                    size=1,max.overlaps = Inf,min.segment.length = 0,segment.size=0.2
    )+
    guides(color=guide_legend(title = "Signal intensity"))+
    xlab("Meth median intensity (log2)")+
    ylab("Unmeth median intensity (log2)")+
    ggtitle(label = paste0(Normalization.type," normalized signal intensities"),
            subtitle = paste0("Good=",length(which(pData(RGset.norm)$QC_good_signal_intensity_normalized_minfi)),
                              ", Bad=",length(which(!pData(RGset.norm)$QC_good_signal_intensity_normalized_minfi))
            )
    )+
    coord_cartesian(xlim = c(8,14),ylim = c(8,14))+
    geom_abline(intercept = 10.5*2,slope = -1,lty=2)+
    theme_bw()
  print(qc.norm.ggplot)
  
  dev.off()
  
  ##release RAM
  rm(RGset.qc)
  invisible(gc())
  
  ## Filtering based on detection p-values
  writeLines("Finding probes with bad signal intensities")
  ##get full annotation for reporting info of removed CpGs
  Full.CpGs.annot <- getAnnotation(RGset.norm)
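  ## detectionP() compares each probe's total (methylated + unmethylated) signal against the background
  ## distribution estimated from the negative control probes.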
  CpGs.detection.pvals <- detectionP(rgSet = RGset)
  invisible(gc())
  
  ## visualize detection pvalues from all samples
  pData(object = RGset.norm)$QC_mean_detection_pval <- colMeans(CpGs.detection.pvals)
  pData(object = RGset.norm)$QC_good_mean_detection_pval <- 
    ifelse(pData(object = RGset.norm)$QC_mean_detection_pval<=Detection.Pval.Mean.Sample.cutoff,TRUE,FALSE)
  
  
  #plot mean detection p-values across samples
  pMean.detection.pval <- ggplot(data.frame(pData(RGset.norm)[order(pData(RGset.norm)$QC_mean_detection_pval),]),
                                 aes(x = factor(get(Sample_Name_Analysis_column),levels = unique(get(Sample_Name_Analysis_column))),
                                     y = QC_mean_detection_pval,
                                     fill=QC_good_mean_detection_pval
                                 )
  )+
    geom_col()+
    geom_hline(yintercept = Detection.Pval.Mean.Sample.cutoff,color="red",linetype="dashed",lwd=0.2)+
    coord_cartesian(ylim = c(0,0.05))+
    facet_grid(.~get(Sample_Group_Analysis_column),scales = "free_x",space = "free_x")+
    xlab("Samples")+
    ylab("Mean detection pvalue")+
    theme_bw()+
    theme(axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          strip.text.x = element_text(size = 5,angle = 90)
    )
  ggsave(filename = "QC/QC_mean_detect_pval.pdf",plot = pMean.detection.pval,height = 3,width = 10,useDingbats=F)
  
  
  ## Select good samples based on Raw signal intensity and good mean detection pvalue
  pData(RGset.norm)$QC_good_sample_quality <- ifelse(pData(RGset.norm)$QC_good_signal_intensity_raw_minfi & pData(RGset.norm)$QC_good_mean_detection_pval, TRUE, FALSE)
  QC_good_sample_quality <- pData(RGset.norm)[which(pData(RGset.norm)$QC_good_sample_quality),Sample_Name_Analysis_column]
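  ## the probe-level detection p-value filtering below is computed on good-quality samples only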
  
  
  if(Detect.Pval.pct.samples==100){
    CpGs.pval.ok <- rownames(CpGs.detection.pvals)[which(rowAlls(x = CpGs.detection.pvals[,QC_good_sample_quality]<=Detection.Pval.CpGs.cutoff,value = TRUE))] ## all samples with lower pval
    writeLines(paste0("Number of probes with good detection pvalue criteria in all samples: ",length(CpGs.pval.ok)))
  }else{
    CpGs.pval.ok <- CpGs.detection.pvals<=Detection.Pval.CpGs.cutoff
    CpGs.pval.ok <- names(which(round(rowMeans(CpGs.pval.ok[,QC_good_sample_quality],na.rm = T),1)>=(as.numeric(Detect.Pval.pct.samples)/100))) # retain positions with a good detection p-value in at least the requested fraction of samples
    writeLines(paste0("Number of probes with good detection pvalue criteria in ",Detect.Pval.pct.samples,"% of the samples: ",length(CpGs.pval.ok)))
  }
  
  ## save detection p values matrix
  qsave(CpGs.detection.pvals,"QC/QC_Detection_pvals.qs")
  
  ##calculate the mean detection p-value of each CpG across samples, to report later for removed CpGs
  CpGs.mean.Detection.pval <- rowMeans(CpGs.detection.pvals[,QC_good_sample_quality],na.rm = T)
  Fraction.samples.above.Detection.pval.cutoff <- rowMeans(CpGs.detection.pvals[,QC_good_sample_quality]<=Detection.Pval.CpGs.cutoff,na.rm = T)
  
  ##Release RAM
  rm(CpGs.detection.pvals,RGset)
  invisible(gc())
  
  
  
  ##
  ## Retain CpGs with good detection p value.
  ##
  
  if(length(CpGs.pval.ok)<length(rownames(RGset.norm))){
    writeLines("Removing CpGs with bad detection pvalues")
    RGset.norm <- RGset.norm[rownames(RGset.norm)[which(rownames(RGset.norm) %in% CpGs.pval.ok)],] ## subset by intersection: some cg probes are not mapped in the (B4) annotation package
  }
  invisible(gc())
  
  
  ## remove SNP-associated probes
  if(RemoveSNPs){
    writeLines(paste0("Removing SNPs"))
    RGset.norm <- dropMethylationLoci(object = RGset.norm,dropRS = T,dropCH = T)
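    ## dropMethylationLoci() removes rs (SNP genotyping) and CH (non-CpG) probes; with maf = 0,
    ## dropLociWithSnps() removes probes overlapping any annotated SNP at the CpG or single-base-extension
    ## site, regardless of minor allele frequency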
    RGset.norm <- dropLociWithSnps(object = RGset.norm,snps=c("SBE","CpG"), maf=MAF)
  }
  invisible(gc())
  
  ## remove predefined individual-specific CpGs, if provided
  if(!is.null(excludingCpGs)){
    writeLines("Getting out individual-specific CpGs...")
    RGset.norm <- RGset.norm[which(!rownames(RGset.norm)%in%excludingCpGs),]
  }
  invisible(gc())
  
  
  ## Filter sex chromosomes
  if(RemoveSexualChrs){
    writeLines("Removing sexual chromosomes")
    RGset.norm <- RGset.norm[which(as.character(seqnames(RGset.norm))!="chrX" & as.character(seqnames(RGset.norm))!="chrY"),]
    seqlevels(RGset.norm) <- as.character(unique(seqnames(RGset.norm)))
  }
  invisible(gc())
  
  
  ##report removed CpGs
  Removed.CpGs <- Full.CpGs.annot[setdiff(rownames(Full.CpGs.annot),rownames(RGset.norm)),
                                  c("chr","pos","strand","Name","Probe_rs","Probe_maf","CpG_rs","CpG_maf","SBE_rs",
                                    "SBE_maf","Relation_to_Island","UCSC_RefGene_Group","UCSC_RefGene_Name")]
  if(!is.null(excludingCpGs)){
    Removed.CpGs$Prior.removal.CpGs <- F
    Removed.CpGs$Prior.removal.CpGs[which(Removed.CpGs$Name%in%excludingCpGs)] <- T  
  }
  invisible(gc())
  
  ## Add mean detection pvalue across samples
  Removed.CpGs$CpGs.mean.Detection.pval <- CpGs.mean.Detection.pval[Removed.CpGs$Name]
  Removed.CpGs$Fraction.samples.above.Detection.pval.cutoff <- Fraction.samples.above.Detection.pval.cutoff[Removed.CpGs$Name]
  
  fwrite(as.data.frame(Removed.CpGs),"QC/QC_removed.CpGs.tsv",sep="\t")
  
  ###########################################################
  ## 3. Get beta values
  ###########################################################
  
  writeLines("Getting beta values")
  ## Get betas
  betas <- as.data.frame(getBeta(RGset.norm))
  invisible(gc())
  
  # ## Gett matrix without converting NA's by normalizations
  # betasNAs <- betas
  # whichBetasNAs <- whichBetasNAs[rownames(betas),]
  # betasNAs[whichBetasNAs] <- NA
  # rm("whichBetasNAs")
  # invisible(gc())
  
  ##
  ## Plot PCA and densityplot of samples with bad quality if present
  ##
  
  if(any(!pData(RGset.norm)$QC_good_sample_quality)){
    
    pdf("QC/QC_Bad_samples_density_PCA.pdf",height = 5,width = 6,useDingbats = F)
    
    writeLines("Ploting density estimates of bad quality samples")
    ##Densityplot
    densityPlot(dat = as.matrix(betas),sampGroups = pData(RGset.norm)[,"QC_good_sample_quality"],
                main = paste0(Normalization.type," normalized data for good(true) and bad(false) samples from QC"),
    )
    
    ## PCA
    writeLines("Ploting PCA of bad quality samples")
    set.seed(6)
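    ## prcomp_irlba() computes only the top n principal components via truncated SVD, which is much
    ## faster than prcomp() on a full beta-value matrix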
    pca <- prcomp_irlba(x = t(betas),n = 5)
    dfpca <- cbind(pca$x[,1:5],data.frame(pData(RGset.norm)))
    pca.ggplot <- ggplot(dfpca,
                         aes(x = PC1,
                             y= PC2,
                             fill=QC_good_sample_quality
                         )
    )+
      xlab(paste0("PC1(",summary(pca)$importance["Proportion of Variance","PC1"]*100,"%)"))+
      ylab(paste0("PC2(",summary(pca)$importance["Proportion of Variance","PC2"]*100,"%)"))+
      geom_point(pch=21,size=2.5)+
      theme_bw()+
      theme(legend.key.size = unit(7,"pt"))
    print(pca.ggplot)
    
    dev.off()
  }
  
  ##
  ## Plot density estimates for good quality data and pca 
  ## 
  
  if(any(pData(RGset.norm)$QC_good_sample_quality)){
    
    pdf("QC/QC_Good_samples_density_PCA.pdf",height = 5,width = 6,useDingbats = F)
    
    writeLines("Ploting density estimates of good quality samples")
    ##Densityplot
    densityPlot(dat = as.matrix(betas)[,pData(RGset.norm)[which(pData(RGset.norm)$QC_good_sample_quality),Sample_Name_Analysis_column]],
                sampGroups = pData(RGset.norm)[which(pData(RGset.norm)$QC_good_sample_quality),Sample_Group_Analysis_column],
                main = paste0(Normalization.type," normalized data for good samples from QC")
    )
    
    writeLines("Ploting PCA of good quality samples")
    set.seed(6)
    pca <- prcomp_irlba(x = t(betas[,pData(RGset.norm)[which(pData(RGset.norm)$QC_good_sample_quality),Sample_Name_Analysis_column]]),n = 5)
    dfpca <- cbind(pca$x[,1:5],data.frame(pData(RGset.norm)[which(pData(RGset.norm)$QC_good_sample_quality),]))
    pca.ggplot <- ggplot(dfpca,
                         aes(x = PC1,
                             y= PC2,
                             fill=get(Sample_Group_Analysis_column)
                         )
    )+
      geom_point(pch=21,size=2.5)+
      guides(fill=guide_legend(title = "Sample group"))+
      xlab(paste0("PC1(",summary(pca)$importance["Proportion of Variance","PC1"]*100,"%)"))+
      ylab(paste0("PC2(",summary(pca)$importance["Proportion of Variance","PC2"]*100,"%)"))+
      theme_bw()+
      theme(legend.key.size = unit(7,"pt"))
    print(pca.ggplot)
    
    dev.off()
  }
  
  
  ###########################################################
  ## 4. Export betas 
  ###########################################################
  
  writeLines("Exporting normalized betas")
  writeLines(paste0(rep("_",100),collapse = ""))
  
  dir.create("Normalized.Betas")
  data.table::fwrite(x = cbind(CpGs=rownames(betas),as.data.frame(betas)),
                     file = paste0("Normalized.Betas/",project.name,"_Betas_",Normalization.type,".tsv"),sep = "\t")
  
  ## export all phenodata associated
  data.table::fwrite(x = as.data.frame(pData(RGset.norm)),file = paste0("Normalized.Betas/QC_",project.name,".tsv"),sep = "\t")
  
  # export sessionInfo and date of analyses
  writeLines(text = capture.output(sessionInfo()),con = paste0(project.name,"_sessionInfo_",gsub(" |:","_",Sys.time()),".txt"))
  
  writeLines("ANALYSIS DONE! Congrats! :-)")

  
  ## clean memory and session
  
  rm(list = ls())
  invisible(gc())
}

2 Package versions

The versions of packages used are detailed in the manuscript.