Bioconductor PETAL
- Rのソースコードはここから取得する:http://peiwang.fhcrc.org/research-project.html
- 論文はこれ:http://www.ncbi.nlm.nih.gov/pubmed/16880200
- 要約:http://nakhirot.hatenablog.com/entry/2013/05/02/103659
- ReadMe.txtの通りに実行すれば良いが、コードミスや不要なデータの入出力も多い
- 下記、修正版
#################1.Getting data##################
#/usr/local/bin/R
#### before PETAL, apply "cleanCol.pl" on the output file of msInspect and get filtered results.
##################################### begin to do alignment
InforColumn=18
CHAR=6
source(file="R_cod_PETAL_function.txt")
###############specify the file names subjected to alignment
#file.name=????
setwd("XXX")
file.name = list.files(path="YYY",pattern="mzXML$")
#"XXX","YYY"は適宜設定
############################## read in file
tsv2csv <- function(file) #read mzXML file and normalize it.
{
pp = read.table(file, header=TRUE, sep="\t")
pp <- pp[,-18] #added by nakahara 13/05/25
dd = lapply(as.character(pp[,18]), function(x) as.numeric(unlist*1 #revised from 217 to 216 by nakahara 13/05/25
}
# ddd = matrix(unlist(dd), ncol=216, byrow=TRUE) #revised from 217 to 216 by nakahara
csv = cbind(pp[,1:17], ddd)
csv <- csv[csv[,"charge"]>0,]
print(paste("dimention of raw data is (",nrow(csv),ncol(csv),")"))
return(csv)
}
file.n<-length(file.name)
data.list<-list(NULL)
for(i in 1:file.n)
{
data.list[[i]]<-tsv2csv(file.name[i])
print(paste("FN=",i,"is read"))
}
FN<-length(data.list)
RAW.data<-data.list
head(RAW.data[[1]])
############## preparing template
#We need much time to make this module done!
total.data<-list(NULL)
total.infor<-list(NULL)
for(char in 1:CHAR)
{
total.data[[char]]<-rep(0, 216)
total.data[[char]]<-total.data[[char]][-(1:216)] #added by nakahara in 05/27/13
total.infor[[char]]<-rep(0, 18)
total.infor[[char]]<-total.infor[[char]][-(1:18)] #added by nakahara in 05/27/13
for(i in 1:FN) {
cur<-RAW.data[[i]]
pick<- cur[, "charge"]==char
if(sum(pick)>0) {
total.data[[char]]<-rbind(total.data[[char]],cur[pick,-(1:17)])
total.infor[[char]]<-rbind(total.infor[[char]],
cbind(i,cur[pick, (1:17)])) #i=file number
print(paste("(char=",char,"FN=",i,")","is done"))
}
}
#change order by mz
order.temp<-order(total.infor[[char]][,"mz"])
total.infor[[char]]<-total.infor[[char]][order.temp,]
total.data[[char]]<-total.data[[char]][order.temp,]
}
################################ don't need that much for template
sample.data<-total.data
sample.infor<-total.infor
for(char in 1:CHAR)
{
count=nrow(total.data[[char]])
print(paste("# of row of total.data,","charge=",char,",count=",count))
if(count>10000)
{
pick=sort(sample(1:count, 10000, replace=FALSE))
sample.data[[char]]=(total.data[[char]])[pick,]
sample.infor[[char]]=(total.infor[[char]])[pick,]
}
}
########## normalize and truncate
total.data.n<-list(NULL)
for(take in 1:CHAR)
{
cur.data<-as.matrix(sample.data[[take]])[,90:216]
cur.mag<-apply(cur.data,1,max)
cur.min<-apply(cur.data,1,min)
temp1<-matrix(cur.mag, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F)
temp2<-matrix(cur.min, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F)
total.data.n[[take]]<-(cur.data-temp2)/(temp1-temp2+0.0001)
}
########### quality control for templates
template.data<-list(NULL)
template.infor<-list(NULL)
for(take in 1:CHAR)
{
cur.data<-total.data.n[[take]]
if(take<4)
{ # filtering wrong charge status
wrong.right.1<-round(1/(take+1)/(6/216),0)+20
wrong.right.2<-round(1/(take+2)/(6/216),0)+20
temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25)
good<-(!temp) & cur.data[,20]>0.4 & cur.data[, wrong.right.1]<0.6 & cur.data[, wrong.right.2]<0.6
} else{# filtering wrong mono-isotope position
temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25)
good<-(1:nrow(cur.data))[!temp]
}
template.data[[take]]<-cur.data[good,]
template.infor[[take]]<-sample.infor[[take]][good,]
}
############### save datas
save(template.data, template.infor, file = "template.Rdata")
save(RAW.data, file="RAW.Rdata")
data.f<-RAW.data
for(i in 1:FN)
{
cur.data<-RAW.data[[i]]
cur.mz<-cur.data[,'mz']
temp.order<-order(cur.mz)
data.f[[i]]<-cur.data[temp.order,]
print(paste("FN=",i,"is stored in data.f"))
}
######################################### Global Alignment
################################ time warping, same as msInspect
data.fg<-data.f
##################### template
TAG=1
Quan=0.6
sigma=100
sigma2=20
plot.curve=TRUE
g1=data.f[[TAG]]
I.1=g1[, 'intensity']
cut.1<-quantile(I.1, Quan)
f1<-g1[g1[,'intensity']>cut.1,]
for(i in (1:FN)[-TAG])
{
g2=data.f[[i]]
I.2=g2[, 'intensity']
cut.2<-quantile(I.2, Quan)
f2<-g2[g2[,'intensity']>cut.2,]
P=get_pairs(f1,f2)
ss1=f1[P[,1],'scan']
ss2=f2[P[,2],'scan']
# coef=linear_align(ss2, ss1, sigma)
coef=c(0,1)
s2t=coef[1]+coef[2]*ss2
sm2=msmooth(s2t,ss1-s2t,df0=10,sigma2,sigma2)
sm=function(x)
{
xt=coef[1]+coef[2]*x
xtt=predict(sm2,xt)$y+xt
}
####### how about running med
pdf(file=paste("globalsmooth", i, ".pdf", sep=""), width=8, height=8)
plot(ss2,ss1-ss2,ylim=c(-300,300), cex=0.8, xlab="scan number of file2", ylab="diff between scan number of file1 and file2", main="Global Smooth Transform")
o=order(ss2)
lines(ss2[o],sm(ss2[o])-ss2[o],col='green',lwd=2)
dev.off()
old.scan<-data.f[[i]][,'scan']
data.fg[[i]][, 'scan']<-sm(g2[,'scan'])
data.fg[[i]][,'scanFirst']<-sm(g2[,'scanFirst'])
data.fg[[i]][,'scanLast']<-sm(g2[,'scanLast'])
data.fg[[i]]<-cbind(old.scan, data.fg[[i]])
print(paste("FN=",i,"is reshaped"))
}
i=TAG
old.scan<-data.f[[i]][,'scan']
data.fg[[i]]<-cbind(old.scan, data.fg[[i]])
save(data.fg, file="data.fg.Rdata")
mzpool<-NULL
for(i in 1:FN)
mzpool<-c(mzpool, data.fg[[i]][,'mz'])
mzpoint<-quantile(mzpool, seq(0, 1, by=0.005))
save(mzpool, file="mzpool.Rdata")
rm(mzpool)
save(mzpoint, file="mzpoint.Rdata")
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#################2.alignbin#################
#divide mz intervals from 85 to 500 into 200 parts
#load("data.fg.Rdata") #number of files
#load("template.Rdata") #output of PETAL_R_cod_1.R
#InforColumn=18
#FN=length(data.fg)
#Revised by nakahara in 05/27/13
#length(data.fg) is # of files
inforcolumn=InforColumn
#take=scan(file="index.R")
for (TAKE in 1:(length(mzpoint)-1)) { #this iteration continues to 423th row
#mzpoint is accumulated value of mz
mz.begin=mzpoint[TAKE]
mz.end=mzpoint[TAKE+1]
#[mz.begin,mz.end] means one of 0.5% intervals of mz
######################### get data
bin.data<-NULL
bin.infor<-NULL
for(i in 1:FN)
{
cur<-data.fg[[i]]
#identify the 0.5% interval of mz
pick<- cur[, "mz"]>=mz.begin & cur[,"mz"]<mz.end
#choose data in the 0.5% interval of mz
bin.data<-rbind(bin.data, cur[pick,-(1:inforcolumn)])
bin.infor<-rbind(bin.infor, cbind(i,cur[pick,(1:inforcolumn)]))
print(paste(TAKE,"th bin","FN=",i,"data is choosen"))
}
bin.data<-as.matrix(bin.data)
#change the column name of "scan"
##Why do we need change these columns name?
bin.infor[, "scanFirst"]<-bin.infor[,"scan"]
bin.infor[,"scanLast"]<-bin.infor[,"scan"]
#################################
mz.cur<-mean(bin.infor[,"mz"])
#mono isotope at 20
shape.cur<-specEle(mz.cur, template.data, template.infor, K=100)
#################################
mz.end.new<-max(mz.end, mz.begin+3)
mz.begin.new<-mz.begin-1
n<-nrow(bin.infor)
#mz.index is a sequence from mz.begin.new to mz.end.new+0.028
mz.index<-seq(mz.begin.new, mz.end.new+0.028, by=6/216)
m<-length(mz.index)
#temp has x rows and y columns,
#where x is 2xmz.index
#and y is # of bins in the 0.5% interval of mz
temp<-apply(as.matrix(1:n), 1, GenBasis,
mz.index=mz.index,bin.infor=bin.infor,
shape.cur=shape.cur, bin.data=bin.data)
#totaldata is a first half of temp
totaldata<-t(temp[-(1:length(mz.index)),])
#totalbaisis is the latter half of temp
totalbasis<-t(temp[1:length(mz.index),])
rm(temp)
#revise temp to data in the final 0.5% interval of mz
temp<-as.matrix(bin.infor)
numcol <- c(1:5,7:19)
temp[,numcol] <- as.numeric(temp[,numcol])
temp[,6] <- as.character(temp[,6])
totalinfor<-matrix(as.numeric(as.vector(temp)), nrow=nrow(totaldata))
colnames(totalinfor)<-colnames(temp)
totalinfor[is.na(totalinfor)]<-0
#############################################################################
#######################################################
#begin.infor and begin.basis both have x rows and y columns,
#where x is xmz.index
#and y is # of bins in the final 0.5% interval of mz
begin.infor<-totalinfor
begin.basis<-totalbasis
####### normalize the initial data
#scal.matrix is a matrix for normalization
scal.matrix<-matrix(apply(totaldata, 1, max), nrow=nrow(totaldata), ncol=ncol(totaldata), byrow=F)
totaldata.norm<-totaldata/scal.matrix
###### regression
#begin.infor and begin.basis both have x rows and y columns,
#where x is xmz.index
#and y is # of bins in the 0.5% interval of mz
CUR.basis<-begin.basis
CUR.infor<-begin.infor
#set parameters for optimization
niter=5
max.step=c(FN+1, rep(5, niter))
track=rep(0, niter+1)
track[1]<-nrow(CUR.basis)
tree.delta<-0.5
####### iteration procedure to produce nice basises
library(lars)
library(elasticnet)
for(i in 1:5) {
Sys.sleep(1)
coematrix<-Do.elastic.new(CUR.basis, CUR.infor, totaldata.norm, totalinfor, k=max.step[i], scan.window=50)
######### combine some redundent basis
base.count<-apply(abs(coematrix)>0.001, 2, sum)
colnames(coematrix)=1:ncol(coematrix)
#coe.dis<-dist(normline(t(coematrix[,base.count>0])))
coe.dis<-consen.dist(t(coematrix[,base.count>0])) ### modified, removed "normline"
coe.clust<-hclust(coe.dis, method="average")
coe.ct<-cutree(coe.clust, h=tree.delta)
base.cl<-rep(0, length(base.count))
base.cl[base.count>0]<-coe.ct
##########
new.basis<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin.new,mz.end.new, shape.cur)
CUR.basis<-new.basis$basis
CUR.infor<-new.basis$info
track[i+1]<-nrow(CUR.basis)
if(track[i+1]==track[i])
{break}
}
############################### back to original data
######## use lasso instead
coematrix.final<-Do.lasso.new(CUR.basis, CUR.infor, totaldata, totalinfor, k=2, scan.window=50)
######## manually clean the matrix, remove those frustrating confounding
coematrix.clean<-clean.coe(coematrix.final, CUR.infor, mz.window=0.3)
base.count<-apply(abs(coematrix.clean)>0.001, 2, sum)
base.cl<-rep(0, length(base.count))
base.cl[base.count>0]<-1:sum(base.count>0)
final.ba<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin,mz.end, shape.cur)
Final.basis<-final.ba$basis
Final.infor<-final.ba$info
coematrix.f<-Do.lasso.new(Final.basis, Final.infor, totaldata, totalinfor, k=2, scan.window=50)
coematrix.clean.f<-clean.coe(coematrix.f, Final.infor, mz.window=0.3)
########################### convert to peptide array
result<-call.basis(coematrix.clean.f, totalinfor, basis.infor=Final.infor)
basis.select<-result$Result
PepA<-result$PeptideArray
sample.ind<-totalinfor[,1]
scan.name<-paste("OldScan", 1:max(sample.ind), sep="")
inten.name<-paste("Intensity", 1:max(sample.ind), sep="")
col.name<-c('scan', 'mz', 'charge', scan.name, inten.name)
colnames(PepA)<-col.name
write(t(PepA), ncolumns=ncol(PepA), file=paste("Peptide", TAKE, sep=""))
print(paste(0.5*TAKE,"% of elasticnet is done"))
}
######################3.Finalization######################
#load("mzpoint.Rdata")
#load("data.fg.Rdata") #number of files
#FN=length(data.fg) #number of files
#source(file="R_cod_PETAL_function.txt")
FN<-length(data.list)
PepArray.result<-NULL
#check.cur2<-1:200
for(i in 1:200)
{
filename<-paste("./Peptide", i,sep="")
cur.data<-matrix(scan(file=filename), ncol=2*FN+3, byrow=T)
#temp<-cur.data[,10]>100 & cur.data[,11]>100
#check.cur2[i]<-cor(log(cur.data[temp,10]), log(cur.data[temp,11]))
PepArray.result<-rbind(PepArray.result, cur.data)
print(paste("peptide",i,"=","(",nrow(PepArray.result),
ncol(PepArray.result),")"))
}
#change order of PepArray.result by mz
mz.order<-order(PepArray.result[,2])
PepArray.result<-PepArray.result[mz.order,]
cbind(mzpoint, mzpoint)
bins<-cbind(mzpoint-0.05, mzpoint+0.05)
#pick.up row number of peaks inclueded in same mz bins
#and has same retention time and charge
#"getPair.inbin" is function for detecting the peaks
double.count<-apply(as.matrix(1:nrow(bins)), 1,
getPair.inbin, bins=bins,
result=PepArray.result)
#################################################################
remove<-NULL
n.c<-length(double.count)
if(n.c>0)
{
for(i in 1:n.c)
{
#double.count the peaks has by peptide file.
cur<-double.count[[i]]
temp<-nrow(cur)
if(temp>0)
{
for(j in 1:temp)
{
#pick up the peaks
cur.pair<-cur[j, ]
#merge the peaks's intensity
temp1<-apply(PepArray.result[cur.pair,-(1:(FN+3))], 1, sum)
#pick up the lower intensity's row number
n.keep<-cur.pair[which.min(temp1)[1]]
remove<-c(remove, n.keep)
}
}
}
}
#remove the lower intensity's row from PepArray.Final
if(length(remove)>0)
{
PepArray.Final<-PepArray.result[-remove, ]
} else{ PepArray.Final<-PepArray.result}
#rename columns of PepArray.Final
default <- c("scan","mz","charge")
colscan <- NULL
colint <- NULL
for (i in 1:FN) {
colscan <- c(colscan, paste(i,"_scan_old",sep=""))
colint <- c(colint, paste(i,"_intensity",sep=""))
}
colall <- c(default,colscan,colint)
colnames(PepArray.Final)<-colall
write.table(PepArray.Final, file="PepArray.Final.txt")
save(PepArray.Final, file="msInspect.PETAL.PepArray.Rdata")
*1:strsplit(x,","))))) k<-length(dd) ddd<-matrix(0, nrow=k, ncol=216) #revised from 217 to 216 by nakahara 13/05/25 for(i in 1:k) { ddd[i,]<-as.numeric(as.vector(dd[i][[1]][1:216]