Bioconductor PETAL

#################1.Getting data##################

#/usr/local/bin/R

#### before PETAL, apply "cleanCol.pl" on the output file of msInspect and get filtered results.

##################################### begin to do alignment

InforColumn=18
CHAR=6
source(file="R_cod_PETAL_function.txt")

###############specify the file names subjected to alignment

#file.name=????

setwd("XXX")
file.name = list.files(path="YYY",pattern="mzXML$")
#"XXX","YYY"は適宜設定 ############################## read in file tsv2csv <- function(file) #read mzXML file and normalize it. { pp = read.table(file, header=TRUE, sep="\t") pp <- pp[,-18] #added by nakahara 13/05/25 dd = lapply(as.character(pp[,18]), function(x) as.numeric(unlist*1 #revised from 217 to 216 by nakahara 13/05/25 } # ddd = matrix(unlist(dd), ncol=216, byrow=TRUE) #revised from 217 to 216 by nakahara csv = cbind(pp[,1:17], ddd) csv <- csv[csv[,"charge"]>0,] print(paste("dimention of raw data is (",nrow(csv),ncol(csv),")")) return(csv) } file.n<-length(file.name) data.list<-list(NULL) for(i in 1:file.n) { data.list[[i]]<-tsv2csv(file.name[i]) print(paste("FN=",i,"is read")) } FN<-length(data.list) RAW.data<-data.list head(RAW.data[[1]]) ############## preparing template #We need much time to make this module done! total.data<-list(NULL) total.infor<-list(NULL) for(char in 1:CHAR) { total.data[[char]]<-rep(0, 216) total.data[[char]]<-total.data[[char]][-(1:216)] #added by nakahara in 05/27/13 total.infor[[char]]<-rep(0, 18) total.infor[[char]]<-total.infor[[char]][-(1:18)] #added by nakahara in 05/27/13 for(i in 1:FN) { cur<-RAW.data[[i]] pick<- cur[, "charge"]==char if(sum(pick)>0) { total.data[[char]]<-rbind(total.data[[char]],cur[pick,-(1:17)]) total.infor[[char]]<-rbind(total.infor[[char]], cbind(i,cur[pick, (1:17)])) #i=file number print(paste("(char=",char,"FN=",i,")","is done")) } } #change order by mz order.temp<-order(total.infor[[char]][,"mz"]) total.infor[[char]]<-total.infor[[char]][order.temp,] total.data[[char]]<-total.data[[char]][order.temp,] } ################################ don't need that much for template sample.data<-total.data sample.infor<-total.infor for(char in 1:CHAR) { count=nrow(total.data[[char]]) print(paste("# of row of total.data,","charge=",char,",count=",count)) if(count>10000) { pick=sort(sample(1:count, 10000, replace=FALSE)) sample.data[[char]]=(total.data[[char]])[pick,] sample.infor[[char]]=(total.infor[[char]])[pick,] } } ########## normalize and truncate total.data.n<-list(NULL) for(take in 1:CHAR) { cur.data<-as.matrix(sample.data[[take]])[,90:216] cur.mag<-apply(cur.data,1,max) cur.min<-apply(cur.data,1,min) temp1<-matrix(cur.mag, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F) temp2<-matrix(cur.min, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F) total.data.n[[take]]<-(cur.data-temp2)/(temp1-temp2+0.0001) } ########### quality control for templates template.data<-list(NULL) template.infor<-list(NULL) for(take in 1:CHAR) { cur.data<-total.data.n[[take]] if(take<4) { # filtering wrong charge status wrong.right.1<-round(1/(take+1)/(6/216),0)+20 wrong.right.2<-round(1/(take+2)/(6/216),0)+20 temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25) good<-(!temp) & cur.data[,20]>0.4 & cur.data[, wrong.right.1]<0.6 & cur.data[, wrong.right.2]<0.6 } else{# filtering wrong mono-isotope position temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25) good<-(1:nrow(cur.data))[!temp] } template.data[[take]]<-cur.data[good,] template.infor[[take]]<-sample.infor[[take]][good,] } ############### save datas save(template.data, template.infor, file = "template.Rdata") save(RAW.data, file="RAW.Rdata") data.f<-RAW.data for(i in 1:FN) { cur.data<-RAW.data[[i]] cur.mz<-cur.data[,'mz'] temp.order<-order(cur.mz) data.f[[i]]<-cur.data[temp.order,] print(paste("FN=",i,"is stored in data.f")) } ######################################### Global Alignment ################################ time warping, same as msInspect data.fg<-data.f ##################### template TAG=1 Quan=0.6 sigma=100 sigma2=20 plot.curve=TRUE g1=data.f[[TAG]] I.1=g1[, 'intensity'] cut.1<-quantile(I.1, Quan) f1<-g1[g1[,'intensity']>cut.1,] for(i in (1:FN)[-TAG]) { g2=data.f[[i]] I.2=g2[, 'intensity'] cut.2<-quantile(I.2, Quan) f2<-g2[g2[,'intensity']>cut.2,] P=get_pairs(f1,f2) ss1=f1[P[,1],'scan'] ss2=f2[P[,2],'scan'] # coef=linear_align(ss2, ss1, sigma) coef=c(0,1) s2t=coef[1]+coef[2]*ss2 sm2=msmooth(s2t,ss1-s2t,df0=10,sigma2,sigma2) sm=function(x) { xt=coef[1]+coef[2]*x xtt=predict(sm2,xt)$y+xt } ####### how about running med pdf(file=paste("globalsmooth", i, ".pdf", sep=""), width=8, height=8) plot(ss2,ss1-ss2,ylim=c(-300,300), cex=0.8, xlab="scan number of file2", ylab="diff between scan number of file1 and file2", main="Global Smooth Transform") o=order(ss2) lines(ss2[o],sm(ss2[o])-ss2[o],col='green',lwd=2) dev.off() old.scan<-data.f[[i]][,'scan'] data.fg[[i]][, 'scan']<-sm(g2[,'scan']) data.fg[[i]][,'scanFirst']<-sm(g2[,'scanFirst']) data.fg[[i]][,'scanLast']<-sm(g2[,'scanLast']) data.fg[[i]]<-cbind(old.scan, data.fg[[i]]) print(paste("FN=",i,"is reshaped")) } i=TAG old.scan<-data.f[[i]][,'scan'] data.fg[[i]]<-cbind(old.scan, data.fg[[i]]) save(data.fg, file="data.fg.Rdata") mzpool<-NULL for(i in 1:FN) mzpool<-c(mzpool, data.fg[[i]][,'mz']) mzpoint<-quantile(mzpool, seq(0, 1, by=0.005)) save(mzpool, file="mzpool.Rdata") rm(mzpool) save(mzpoint, file="mzpoint.Rdata") ######################################################################### ######################################################################### ######################################################################### ######################################################################### #################2.alignbin################# #divide mz intervals from 85 to 500 into 200 parts #load("data.fg.Rdata") #number of files #load("template.Rdata") #output of PETAL_R_cod_1.R #InforColumn=18 #FN=length(data.fg) #Revised by nakahara in 05/27/13 #length(data.fg) is # of files inforcolumn=InforColumn #take=scan(file="index.R") for (TAKE in 1:(length(mzpoint)-1)) { #this iteration continues to 423th row #mzpoint is accumulated value of mz mz.begin=mzpoint[TAKE] mz.end=mzpoint[TAKE+1] #[mz.begin,mz.end] means one of 0.5% intervals of mz ######################### get data bin.data<-NULL bin.infor<-NULL for(i in 1:FN) { cur<-data.fg[[i]] #identify the 0.5% interval of mz pick<- cur[, "mz"]>=mz.begin & cur[,"mz"]<mz.end #choose data in the 0.5% interval of mz bin.data<-rbind(bin.data, cur[pick,-(1:inforcolumn)]) bin.infor<-rbind(bin.infor, cbind(i,cur[pick,(1:inforcolumn)])) print(paste(TAKE,"th bin","FN=",i,"data is choosen")) } bin.data<-as.matrix(bin.data) #change the column name of "scan" ##Why do we need change these columns name? bin.infor[, "scanFirst"]<-bin.infor[,"scan"] bin.infor[,"scanLast"]<-bin.infor[,"scan"] ################################# mz.cur<-mean(bin.infor[,"mz"]) #mono isotope at 20 shape.cur<-specEle(mz.cur, template.data, template.infor, K=100) ################################# mz.end.new<-max(mz.end, mz.begin+3) mz.begin.new<-mz.begin-1 n<-nrow(bin.infor) #mz.index is a sequence from mz.begin.new to mz.end.new+0.028 mz.index<-seq(mz.begin.new, mz.end.new+0.028, by=6/216) m<-length(mz.index) #temp has x rows and y columns, #where x is 2xmz.index #and y is # of bins in the 0.5% interval of mz temp<-apply(as.matrix(1:n), 1, GenBasis, mz.index=mz.index,bin.infor=bin.infor, shape.cur=shape.cur, bin.data=bin.data) #totaldata is a first half of temp totaldata<-t(temp[-(1:length(mz.index)),]) #totalbaisis is the latter half of temp totalbasis<-t(temp[1:length(mz.index),]) rm(temp) #revise temp to data in the final 0.5% interval of mz temp<-as.matrix(bin.infor) numcol <- c(1:5,7:19) temp[,numcol] <- as.numeric(temp[,numcol]) temp[,6] <- as.character(temp[,6]) totalinfor<-matrix(as.numeric(as.vector(temp)), nrow=nrow(totaldata)) colnames(totalinfor)<-colnames(temp) totalinfor[is.na(totalinfor)]<-0 ############################################################################# ####################################################### #begin.infor and begin.basis both have x rows and y columns, #where x is xmz.index #and y is # of bins in the final 0.5% interval of mz begin.infor<-totalinfor begin.basis<-totalbasis ####### normalize the initial data #scal.matrix is a matrix for normalization scal.matrix<-matrix(apply(totaldata, 1, max), nrow=nrow(totaldata), ncol=ncol(totaldata), byrow=F) totaldata.norm<-totaldata/scal.matrix ###### regression #begin.infor and begin.basis both have x rows and y columns, #where x is xmz.index #and y is # of bins in the 0.5% interval of mz CUR.basis<-begin.basis CUR.infor<-begin.infor #set parameters for optimization niter=5 max.step=c(FN+1, rep(5, niter)) track=rep(0, niter+1) track[1]<-nrow(CUR.basis) tree.delta<-0.5 ####### iteration procedure to produce nice basises library(lars) library(elasticnet) for(i in 1:5) { Sys.sleep(1) coematrix<-Do.elastic.new(CUR.basis, CUR.infor, totaldata.norm, totalinfor, k=max.step[i], scan.window=50) ######### combine some redundent basis base.count<-apply(abs(coematrix)>0.001, 2, sum) colnames(coematrix)=1:ncol(coematrix) #coe.dis<-dist(normline(t(coematrix[,base.count>0]))) coe.dis<-consen.dist(t(coematrix[,base.count>0])) ### modified, removed "normline" coe.clust<-hclust(coe.dis, method="average") coe.ct<-cutree(coe.clust, h=tree.delta) base.cl<-rep(0, length(base.count)) base.cl[base.count>0]<-coe.ct ########## new.basis<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin.new,mz.end.new, shape.cur) CUR.basis<-new.basis$basis CUR.infor<-new.basis$info track[i+1]<-nrow(CUR.basis) if(track[i+1]==track[i]) {break} } ############################### back to original data ######## use lasso instead coematrix.final<-Do.lasso.new(CUR.basis, CUR.infor, totaldata, totalinfor, k=2, scan.window=50) ######## manually clean the matrix, remove those frustrating confounding coematrix.clean<-clean.coe(coematrix.final, CUR.infor, mz.window=0.3) base.count<-apply(abs(coematrix.clean)>0.001, 2, sum) base.cl<-rep(0, length(base.count)) base.cl[base.count>0]<-1:sum(base.count>0) final.ba<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin,mz.end, shape.cur) Final.basis<-final.ba$basis Final.infor<-final.ba$info coematrix.f<-Do.lasso.new(Final.basis, Final.infor, totaldata, totalinfor, k=2, scan.window=50) coematrix.clean.f<-clean.coe(coematrix.f, Final.infor, mz.window=0.3) ########################### convert to peptide array result<-call.basis(coematrix.clean.f, totalinfor, basis.infor=Final.infor) basis.select<-result$Result PepA<-result$PeptideArray sample.ind<-totalinfor[,1] scan.name<-paste("OldScan", 1:max(sample.ind), sep="") inten.name<-paste("Intensity", 1:max(sample.ind), sep="") col.name<-c('scan', 'mz', 'charge', scan.name, inten.name) colnames(PepA)<-col.name write(t(PepA), ncolumns=ncol(PepA), file=paste("Peptide", TAKE, sep="")) print(paste(0.5*TAKE,"% of elasticnet is done")) } ######################3.Finalization###################### #load("mzpoint.Rdata") #load("data.fg.Rdata") #number of files #FN=length(data.fg) #number of files #source(file="R_cod_PETAL_function.txt") FN<-length(data.list) PepArray.result<-NULL #check.cur2<-1:200 for(i in 1:200) { filename<-paste("./Peptide", i,sep="") cur.data<-matrix(scan(file=filename), ncol=2*FN+3, byrow=T) #temp<-cur.data[,10]>100 & cur.data[,11]>100 #check.cur2[i]<-cor(log(cur.data[temp,10]), log(cur.data[temp,11])) PepArray.result<-rbind(PepArray.result, cur.data) print(paste("peptide",i,"=","(",nrow(PepArray.result), ncol(PepArray.result),")")) } #change order of PepArray.result by mz mz.order<-order(PepArray.result[,2]) PepArray.result<-PepArray.result[mz.order,] cbind(mzpoint, mzpoint) bins<-cbind(mzpoint-0.05, mzpoint+0.05) #pick.up row number of peaks inclueded in same mz bins #and has same retention time and charge #"getPair.inbin" is function for detecting the peaks double.count<-apply(as.matrix(1:nrow(bins)), 1, getPair.inbin, bins=bins, result=PepArray.result) ################################################################# remove<-NULL n.c<-length(double.count) if(n.c>0) { for(i in 1:n.c) { #double.count the peaks has by peptide file. cur<-double.count[[i]] temp<-nrow(cur) if(temp>0) { for(j in 1:temp) { #pick up the peaks cur.pair<-cur[j, ] #merge the peaks's intensity temp1<-apply(PepArray.result[cur.pair,-(1:(FN+3))], 1, sum) #pick up the lower intensity's row number n.keep<-cur.pair[which.min(temp1)[1]] remove<-c(remove, n.keep) } } } } #remove the lower intensity's row from PepArray.Final if(length(remove)>0) { PepArray.Final<-PepArray.result[-remove, ] } else{ PepArray.Final<-PepArray.result} #rename columns of PepArray.Final default <- c("scan","mz","charge") colscan <- NULL colint <- NULL for (i in 1:FN) { colscan <- c(colscan, paste(i,"_scan_old",sep="")) colint <- c(colint, paste(i,"_intensity",sep="")) } colall <- c(default,colscan,colint) colnames(PepArray.Final)<-colall write.table(PepArray.Final, file="PepArray.Final.txt") save(PepArray.Final, file="msInspect.PETAL.PepArray.Rdata")

*1:strsplit(x,","))))) k<-length(dd) ddd<-matrix(0, nrow=k, ncol=216) #revised from 217 to 216 by nakahara 13/05/25 for(i in 1:k) { ddd[i,]<-as.numeric(as.vector(dd[i][[1]][1:216]