## 1.PRS using SBayesR # To run SBayesR run the lines below. Make sure you degin the input variable before (name of sumstats in GCTA ma format) echo " #!/bin/bash #PBS -N SbayesR #PBS -l mem=260GB,walltime=24:00:00 cd \$PBS_O_WORKDIR module load gctb/2.03 gctb --sbayes R \\ --mldm /reference/genepi/public_reference_panels/ldm_ukb_50k_bigset_2.8M/ukb50k_2.8M_shrunk_sparse.mldmlist \\ --pi 0.95,0.02,0.02,0.01 \\ --gamma 0.0,0.01,0.1,1 \\ --gwas-summary ../${input}.ma \\ --chain-length 10000 --burn-in 2000 --out-freq 10 \\ --out ${input}_SBayesR_fullLDM " > Submit_SBayesR.PBS qsub Submit_SBayesR.PBS #Extract the effect sizes and prepare them for plink scoring. In this case I am printing the chr:bp:a1:a2 (or a2:a1) plus the effect allele ($5) and the effect size ($8). awk '(FNR>1){print $3":"$4":"$5":"$6,$5,$8"\n"$3":"$4":"$6":"$5,$5,$8}' ${input}_SBayesR_fullLDM.snpRes > ${input}_SBayesR_betas.txt # Scirpt for PLINK polygenic risk scoring. This script is the base for all other PRS what you do before is what differs in C+T vs other methods echo " #!/bin/bash #PBS -N PlinkScoring #PBS -l mem=35GB,walltime=12:00:00,ncpus=4 cd \$PBS_O_WORKDIR chr=\$PBS_ARRAY_INDEX /working/genepi/software/bin/plink_2.00 --vcf /working/lab_nickm/adrianC/SAMPLE/ImputedFiles/chr\${chr}.dose.vcf.gz dosage=DS --double-id \\ --extract /working/lab_nickm/adrianC/SAMPLE/ImputedFiles/QCed_variants_MAF01_CR90_RSQ60.list \\ --score ${input}_SBayesR_betas.txt cols=+scoresums \\ --threads 4 \\ --memory 35000 \\ --out PRScalc/SAMPLE_${input}_PRS_chr\${chr} " > Submit_PLINKscore_array.PBS mkdir logfiles mkdir PRScalc #Note this is specific to the case where the genotype data is split by chromosomes, so we are taking advantage of that in the submission scripts qsub -J 1-22 -e logfiles/ -o logfiles/ Submit_PLINKscore_array.PBS # Run this for submissionm do not run if you want to just check the script generated # This will yield PRS by chromosome. You want to combine all of them into a single file by summing across samples. # you can do this in R or python or whatever your tool of choice. In my case I made a custom python script. ## 2.PRS using Clumping and Thresholding # In this example we will be clumping using UKB as a reference panel, but you can use 1000G or any other LD panel: #This assumes the input file to have these columns: SNP CHR BP A1 A2 FREQ BETA SE P N #where SNP is rs number or the matched SNPID between sumstats, LDref panel and target sample mkdir clumping cd clumping awk '(FNR>1) {print $1}' ../${input} > allSnpRsNumberUniqueNoIndels echo " #!/bin/bash #PBS -N clump #PBS -l mem=24GB,walltime=2:00:00 module load plink/1.90b6.8 cd \$PBS_O_WORKDIR chr=\$PBS_ARRAY_INDEX plink --bfile /reference/data/UKBB_500k/versions/LDref/HRC_only/UKBB_LD_bestguess_ref_panel_maf1e4_rsq_point3_HRC_SNPs_only \\ --extract allSnpRsNumberUniqueNoIndels \\ --clump ../${input} \\ --clump-p1 1 \\ --clump-p2 1 \\ --clump-r2 0.2 \\ --clump-kb 2000 \\ --threads 1 \\ --memory 20000 \\ --out ./${input} " > Submit_Clumping.PBS qsub Submit_Clumping.PBS # Once clumping is done match back to get A1 and beta: awk 'BEGIN{OFS="\t"}(NR==FNR){leadSNP[$3]=1;next}(FNR==1){print "SNP","A1","effect","P";next}($1 in leadSNP){print $2":"$3":"$4":"$5,$4,$7,$9"\n"$2":"$3":"$5":"$4,$4,$7,$9}' ./${input}.clumped ../${input} > indexSNPsumstats.ctg # Now this is the scoring submission script echo " #!/bin/bash #PBS -N PlinkScoring #PBS -l mem=35GB,walltime=12:00:00,ncpus=4 cd \$PBS_O_WORKDIR chr=\$PBS_ARRAY_INDEX module load plink/20200511 plink2 --pfile /working/lab_nickm/adrianC/SAMPLE/ImputedFiles/plink2Format/SAMPLE_imputed_plink2_DS_chr\${chr} \\ --extract /working/lab_nickm/adrianC/SAMPLE/ImputedFiles/QCed_variants_MAF01_CR90_RSQ60.list \\ --score indexSNPsumstats.ctg 1 2 header cols=+scoresums \\ --score-col-nums 3 \\ --q-score-range pvalue.ranges indexSNPsumstats.ctg 1 4 \\ --threads 4 \\ --memory 35000 \\ --out ./${prefix}_SAMPLEclumpedPRS_chr\${chr} " > Submit_PLINKscore_array.PBS qsub -J1-22 -e /dev/null -o /dev/null Submit_PLINKscore_array.PBS # Again this will yield PRS by chromosome. You want to combine all of them into a single file by summing across samples. # you can do this in R or python or whatever your tool of choice.