################################################################################
######## PRSice analysis for creation of polygenic risk scores (PRS) ###########
################################################################################

## PRSice software was developed by Choi SW and O’Reilly P (https://www.prsice.info) 
## PRSice_v2.2.13 was used

base="~/base_summary_statistics" ### base summary statistics
target="~/plink_binary" ### target plink binary files
keep="~/post_qc_sub_ID_list" ### list of subjects passing quality control
extract="~/post_qc_SNPs_list" ### list of SNPs passing quality control
output="~/PRS_results" ### output file name

Rscript ~/PRSice.R \
--dir . \
--prsice ~/PRSice \
--base $base \
--target $target \
--keep $keep \
--extract $extract \
--clump-r2 0.1 \
--clump-p 1 \
--clump-kb 250 \
--bar-levels 5e-8,1e-5,1e-3,0.01,0.05,0.1,0.2,0.3,0.4,0.5,1 \ # p thresholds
--binary-target T \
--fastscore \
--no-regress \
--out $output


##################################################################################################################
################## R code for regression models of phenotype and PRS of traits of interest #######################
##################################################################################################################

## R version 3.6.0

setwd("~/PRS_directory") ### set directory with PRS files obtained by PRSice

file.names <- dir(pattern ="*.all.score") ### read PRS files created with PRSice, where the first two columns are FID and IID, and the following columns correspond to PRS at 11 p thresholds; PRS at different p thresholds were standardised to calculate effect size

scores<-list()
for (k in 1:length(file.names)){scores[[k]] <- read.table(file.names[k],h=T)}

pheno<-read.table("phenotype_file", h=T) ### read phenotype file (two columns called FID and pheno)
cov<-read.table("covariate_file", h=T) ### read covariate file (FID, first six population principal components, batch and assessment_centre)

for (k in 1:length(file.names)) {scores[[k]]<-merge(scores[[k]],pheno, by='FID') } ### merging of PRS and phenotype files by individual ID (FID)

for (k in 1:length(file.names)) {scores[[k]]<-merge(scores[[k]],cov, by='FID') } ### merging of PRS-phenotype and covariate files by individual ID (FID)

for (k in 1:length(file.names)) {scores[[k]][,3:13] <- apply(scores[[k]][3:13], 2, scale)} 

m1 <-list()

# Regression at 11 p threshold for all PRS (assuming that PRS at different threshold are in columns 3-13 of each element in the list called scores), covariates are the first 6 population principal components (PC), batch (coded as factor) and assessment_centre (factor)

for (i in 1:length(scores))
m1[[i]]<-apply(scores[[i]][3:13],2,FUN <- function(x) glm(pheno ~ x + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + batch + as.factor(assessment.centre), family='binomial',data=scores[[i]]))

results<-list()

for (i in 1:length(scores))
results[[i]] <- data.frame (do.call(rbind,lapply(m1[[i]], function(z)summary(z)$coefficients[2,])))

for (i in 1:length(results)) results[[i]]$OR <- exp(results[[i]]$Estimate) ### adds OR

for (i in 1:length(results)) results[[i]]$ci_l <- exp(results[[i]]$Estimate-(1.96 * results[[i]]$Std..Error)) ### adds lower limit of 95% CI

for (i in 1:length(results)) results[[i]]$up_l <- exp(results[[i]]$Estimate+(1.96 * results[[i]]$Std..Error)) ### adds upper limit of 95% CI

# calculate Nagelkerke R2

library(sizeMat)

r2 <- list()
for (i in 1:length(results))
r2[[i]]<-data.frame(do.call(rbind,lapply(m1[[i]],function(z)nagelkerkeR2(z))))

n <- list()
for (i in 1:length(results)) n[[i]] <- glm(pheno ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + batch + as.factor(assessment.centre), family='binomial',data=scores[[i]]))

for (i in 1:length(results)) r2[[i]]$prs.r2 <- r2[[i]][,1] - nagelkerkeR2(n[[i]])

for (i in 1:length(results)) results[[i]]$full.r2 <- r2[[i]][,1]
for (i in 1:length(results)) results[[i]]$prs.r2 <- r2[[i]]$prs.r2

# write the results
setwd("~/results_directory/")

for (i in 1:length(results)) 
write.table(results[[i]], paste(file.names[i], "results", sep='_'), col.names=T,row.names=T, sep=' ', quote=F) ### writes the results as separate tables for each PRS in the set directory


############################################################
####### convert Nagelkerke R2 to the liability scale #######
############################################################

### R

h2l_R2N <- function(k, r2n, p) {
  # k baseline disease risk
  # r2n Nagelkerke's attributable to genomic profile risk score
  # proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  #from ABC at http://www.complextraitgenomics.com/software/
  #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x <- qnorm(1 - k)
  z <- dnorm(x)
  i <- z / k
  cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  theta <- i * ((p - k)/(1 - k)) * (i * ((p - k) / ( 1 - k)) - x)
  e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p))
  h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n)
}