################################################################################ ######## PRSice analysis for creation of polygenic risk scores (PRS) ########### ################################################################################ ## PRSice software was developed by Choi SW and O’Reilly P (https://www.prsice.info) ## PRSice_v2.2.13 was used base="~/base_summary_statistics" ### base summary statistics target="~/plink_binary" ### target plink binary files keep="~/post_qc_sub_ID_list" ### list of subjects passing quality control extract="~/post_qc_SNPs_list" ### list of SNPs passing quality control output="~/PRS_results" ### output file name Rscript ~/PRSice.R \ --dir . \ --prsice ~/PRSice \ --base $base \ --target $target \ --keep $keep \ --extract $extract \ --clump-r2 0.1 \ --clump-p 1 \ --clump-kb 250 \ --bar-levels 5e-8,1e-5,1e-3,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 \ # p thresholds --binary-target T \ --fastscore \ --no-regress \ --out $output ################################################################################################################## ################## R code for regression models of phenotype and PRS of traits of interest ####################### ################################################################################################################## ## R version 3.6.0 setwd("~/PRS_directory") ### set directory with PRS files obtained by PRSice file.names <- dir(pattern ="*.all.score") ### read PRS files created with PRSice, where the first two columns are FID and IID, and the following columns correspond to PRS at 11 p thresholds; PRS at different p thresholds were standardised to calculate effect size scores<-list() for (k in 1:length(file.names)){scores[[k]] <- read.table(file.names[k],h=T)} pheno<-read.table("phenotype_file", h=T) ### read phenotype file (two columns called FID and pheno) cov<-read.table("covariate_file", h=T) ### read covariate file (FID, first six population principal components, batch and assessment_centre) for (k in 1:length(file.names)) {scores[[k]]<-merge(scores[[k]],pheno, by='FID') } ### merging of PRS and phenotype files by individual ID (FID) for (k in 1:length(file.names)) {scores[[k]]<-merge(scores[[k]],cov, by='FID') } ### merging of PRS-phenotype and covariate files by individual ID (FID) for (k in 1:length(file.names)) {scores[[k]][,3:13] <- apply(scores[[k]][3:13], 2, scale)} m1 <-list() # Regression at 11 p threshold for all PRS (assuming that PRS at different threshold are in columns 3-13 of each element in the list called scores), covariates are the first 6 population principal components (PC), batch (coded as factor) and assessment_centre (factor) for (i in 1:length(scores)) m1[[i]]<-apply(scores[[i]][3:13],2,FUN <- function(x) glm(pheno ~ x + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + batch + as.factor(assessment.centre), family='binomial',data=scores[[i]])) results<-list() for (i in 1:length(scores)) results[[i]] <- data.frame (do.call(rbind,lapply(m1[[i]], function(z)summary(z)$coefficients[2,]))) for (i in 1:length(results)) results[[i]]$OR <- exp(results[[i]]$Estimate) ### adds OR for (i in 1:length(results)) results[[i]]$ci_l <- exp(results[[i]]$Estimate-(1.96 * results[[i]]$Std..Error)) ### adds lower limit of 95% CI for (i in 1:length(results)) results[[i]]$up_l <- exp(results[[i]]$Estimate+(1.96 * results[[i]]$Std..Error)) ### adds upper limit of 95% CI # calculate Nagelkerke R2 library(sizeMat) r2 <- list() for (i in 1:length(results)) r2[[i]]<-data.frame(do.call(rbind,lapply(m1[[i]],function(z)nagelkerkeR2(z)))) n <- list() for (i in 1:length(results)) n[[i]] <- glm(pheno ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + batch + as.factor(assessment.centre), family='binomial',data=scores[[i]])) for (i in 1:length(results)) r2[[i]]$prs.r2 <- r2[[i]][,1] - nagelkerkeR2(n[[i]]) for (i in 1:length(results)) results[[i]]$full.r2 <- r2[[i]][,1] for (i in 1:length(results)) results[[i]]$prs.r2 <- r2[[i]]$prs.r2 # write the results setwd("~/results_directory/") for (i in 1:length(results)) write.table(results[[i]], paste(file.names[i], "results", sep='_'), col.names=T,row.names=T, sep=' ', quote=F) ### writes the results as separate tables for each PRS in the set directory ############################################################ ####### convert Nagelkerke R2 to the liability scale ####### ############################################################ ### R h2l_R2N <- function(k, r2n, p) { # k baseline disease risk # r2n Nagelkerke's attributable to genomic profile risk score # proportion of sample that are cases # calculates proportion of variance explained on the liability scale #from ABC at http://www.complextraitgenomics.com/software/ #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24. x <- qnorm(1 - k) z <- dnorm(x) i <- z / k cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p)) theta <- i * ((p - k)/(1 - k)) * (i * ((p - k) / ( 1 - k)) - x) e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p)) h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n) }