my.dir <- "C:/Documents and Settings/Rich/My Documents/projects2/Zoo-MyWeb/Rdata" source("http://www.uvm.edu/~rsingle/Rdata/scripts_stat295F14.R") library(genetics) #-------------------------------------------------------------------- # HWP & Population Structure #-------------------------------------------------------------------- obs.1 <- matrix(c(128, 32, 32, 8), nrow=2) #split the heterozygotes evenly obs.2 <- matrix(c(32, 48, 48, 72), nrow=2) #split the heterozygotes evenly obs.tot <- obs.1 + obs.2 obs.1 chisq.test(obs.1, correct=F)$expected chisq.test(obs.1, correct=F) chisq.test(obs.2, correct=F) chisq.test(obs.tot, correct=F) #-------------------------------------------------------------------- source("http://www.uvm.edu/~rsingle/Rdata/scripts_stat295F14.R") hgdp <- otherdata("HGDP_AKT1.txt") #hgdp <- otherdata("HGDP_AKT1.txt", web=F, Rdata.dir=my.dir) #4 SNPs in the AKT1 gene (v-akt murine thymoma viral oncogene homolog 1) #http://ghr.nlm.nih.gov/gene/AKT1 #(HWE and geographic origin): attach(hgdp) head(hgdp) table(Geographic.area) summary( genotype( AKT1.C0756A[Geographic.area=="Central_Africa"], sep="" ) ) FREQGeoArea <- by( genotype(AKT1.C0756A, sep=""), Geographic.area, summary) FREQGeoArea$Russia FREQGeoArea$Russia$allele.freq FREQGeoArea$Russia$allele.freq[1,] FREQGeoArea$Russia$allele.freq[,1] FREQGeoArea$Russia$allele.freq[,2] HWE.chisq( genotype( AKT1.C0756A[Geographic.area=="Central_Africa"], sep="" ) ) HWE.chisq( genotype( AKT1.C0756A[Geographic.area=="Russia"], sep="" ) ) HWEGeoArea <- by( genotype(AKT1.C0756A, sep=""), Geographic.area, HWE.chisq) Population[Geographic.area=="Russia"] by( genotype(AKT1.C0756A[Geographic.area=="Russia"], sep=""), Population[Geographic.area=="Russia"], summary) par(mfrow=c(2,2)) by( genotype(AKT1.C0756A[Geographic.area=="Russia"], sep=""), Population[Geographic.area=="Russia"], plot) par(mfrow=c(1,1)) #-------------------------------------------------------------------- # Principal Component Analysis (PCA) #-------------------------------------------------------------------- #Set up data for Principal Components (as a matrix) NamesAKT1snps <- names(hgdp)[substr(names(hgdp),1,4)=="AKT1"] NamesAKT1snps REGIONgeno <- hgdp[Geographic.area=="Russia", NamesAKT1snps] # Russia vs. Central_Africa head(REGIONgeno) REGIONgenoNum <- data.matrix(REGIONgeno) head(REGIONgenoNum) #(Principal components analysis (PCA) for identifying population substructure): PC.REGION <- prcomp(REGIONgenoNum, na.action=na.omit) plot(PC.REGION$"x"[,1],PC.REGION$"x"[,2],xlab="PC1",ylab="PC2") #sdev = SD of principal components = SQRT(eigenvalues) of the covariance matrix sum(PC.REGION$sdev[1:2]^2)/sum(PC.REGION$sdev^2) cor(REGIONgenoNum) cov(REGIONgenoNum) eigen.cov <- eigen( cov(REGIONgenoNum) ) eigen.cov sum(eigen.cov$values[1:2])/sum(eigen.cov$values) t(REGIONgenoNum) %*% REGIONgenoNum #--------------------------------------------------------------------