Bioconductor PETAL - DSL

Rのソースコードはここから取得する：http://peiwang.fhcrc.org/research-project.html
論文はこれ：http://www.ncbi.nlm.nih.gov/pubmed/16880200
要約：http://nakhirot.hatenablog.com/entry/2013/05/02/103659
ReadMe.txtの通りに実行すれば良いが、コードミスや不要なデータの入出力も多い
下記、修正版
#################1.Getting data##################

#/usr/local/bin/R

#### before PETAL, apply "cleanCol.pl" on the output file of msInspect and get filtered results.

##################################### begin to do alignment

InforColumn=18
CHAR=6
source(file="R_cod_PETAL_function.txt")

###############specify the file names subjected to alignment

#file.name=????

setwd("XXX")
file.name = list.files(path="YYY",pattern="mzXML$")
#"XXX","YYY"は適宜設定

############################## read in file

tsv2csv <- function(file) #read mzXML file and normalize it.
{  
  pp = read.table(file, header=TRUE, sep="\t")
  pp <- pp[,-18]  #added by nakahara 13/05/25
  dd = lapply(as.character(pp[,18]), function(x) as.numeric(unlist*1 #revised from 217 to 216 by nakahara 13/05/25
  }
  
  # ddd = matrix(unlist(dd), ncol=216, byrow=TRUE) #revised from 217 to 216 by nakahara
  csv = cbind(pp[,1:17], ddd)
  csv <- csv[csv[,"charge"]>0,]
  print(paste("dimention of raw data is (",nrow(csv),ncol(csv),")"))
  return(csv)
}

file.n<-length(file.name)
data.list<-list(NULL)

for(i in 1:file.n)
{ 
  data.list[[i]]<-tsv2csv(file.name[i])
  print(paste("FN=",i,"is read"))
}

FN<-length(data.list)
RAW.data<-data.list

head(RAW.data[[1]])

############## preparing template

#We need much time to make this module done!

total.data<-list(NULL)
total.infor<-list(NULL)

for(char in 1:CHAR)
{
  total.data[[char]]<-rep(0, 216)
  total.data[[char]]<-total.data[[char]][-(1:216)] #added by nakahara in 05/27/13
  total.infor[[char]]<-rep(0, 18)
  total.infor[[char]]<-total.infor[[char]][-(1:18)] #added by nakahara in 05/27/13
  
  for(i in 1:FN) {
    cur<-RAW.data[[i]]
    pick<- cur[, "charge"]==char
    if(sum(pick)>0) {
      total.data[[char]]<-rbind(total.data[[char]],cur[pick,-(1:17)])
      total.infor[[char]]<-rbind(total.infor[[char]],
                                 cbind(i,cur[pick, (1:17)])) #i=file number
      print(paste("(char=",char,"FN=",i,")","is done"))
    }
  }
  
  #change order by mz
  order.temp<-order(total.infor[[char]][,"mz"])
  total.infor[[char]]<-total.infor[[char]][order.temp,]
  total.data[[char]]<-total.data[[char]][order.temp,]
}

################################ don't need that much for template

sample.data<-total.data
sample.infor<-total.infor

for(char in 1:CHAR)
{
  count=nrow(total.data[[char]])
  print(paste("# of row of total.data,","charge=",char,",count=",count))
  if(count>10000)
  {
    pick=sort(sample(1:count, 10000, replace=FALSE))
    sample.data[[char]]=(total.data[[char]])[pick,]
    sample.infor[[char]]=(total.infor[[char]])[pick,]
  }
}

########## normalize and truncate
total.data.n<-list(NULL)
for(take in 1:CHAR)
{
  cur.data<-as.matrix(sample.data[[take]])[,90:216]
  cur.mag<-apply(cur.data,1,max)
  cur.min<-apply(cur.data,1,min)
  temp1<-matrix(cur.mag, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F)
  temp2<-matrix(cur.min, nrow=nrow(cur.data), ncol=ncol(cur.data), byrow=F)
  total.data.n[[take]]<-(cur.data-temp2)/(temp1-temp2+0.0001)
}

########### quality control for templates

template.data<-list(NULL)
template.infor<-list(NULL)

for(take in 1:CHAR)
{
  cur.data<-total.data.n[[take]]
  
  if(take<4)
  {  # filtering wrong charge status
    wrong.right.1<-round(1/(take+1)/(6/216),0)+20
    wrong.right.2<-round(1/(take+2)/(6/216),0)+20
    temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25)
    good<-(!temp) & cur.data[,20]>0.4 & cur.data[, wrong.right.1]<0.6 & cur.data[, wrong.right.2]<0.6
  } else{# filtering wrong mono-isotope position
    temp<-apply(cur.data, 1, wrong.mono, k=20, cut=0.25)
    good<-(1:nrow(cur.data))[!temp]
  }
  template.data[[take]]<-cur.data[good,]
  template.infor[[take]]<-sample.infor[[take]][good,]	   
}

############### save datas
save(template.data, template.infor, file = "template.Rdata")
save(RAW.data, file="RAW.Rdata")

data.f<-RAW.data
for(i in 1:FN)
{
  cur.data<-RAW.data[[i]]
  cur.mz<-cur.data[,'mz']
  temp.order<-order(cur.mz)
  data.f[[i]]<-cur.data[temp.order,]
  print(paste("FN=",i,"is stored in data.f"))
}
######################################### Global Alignment 
################################ time warping, same as msInspect

data.fg<-data.f

##################### template
TAG=1
Quan=0.6

sigma=100 
sigma2=20 
plot.curve=TRUE

g1=data.f[[TAG]]
I.1=g1[, 'intensity']
cut.1<-quantile(I.1, Quan)
f1<-g1[g1[,'intensity']>cut.1,]


for(i in (1:FN)[-TAG])
{ 
  g2=data.f[[i]]
  I.2=g2[, 'intensity']
  cut.2<-quantile(I.2, Quan)   
  f2<-g2[g2[,'intensity']>cut.2,]
  
  P=get_pairs(f1,f2)
  
  ss1=f1[P[,1],'scan']
  ss2=f2[P[,2],'scan']
  
  # coef=linear_align(ss2, ss1, sigma)
  coef=c(0,1)
  
  s2t=coef[1]+coef[2]*ss2
  sm2=msmooth(s2t,ss1-s2t,df0=10,sigma2,sigma2)
  sm=function(x) 
  {
    xt=coef[1]+coef[2]*x
    xtt=predict(sm2,xt)$y+xt
  }
  
  ####### how about running med
  
  pdf(file=paste("globalsmooth", i, ".pdf", sep=""), width=8, height=8)
  plot(ss2,ss1-ss2,ylim=c(-300,300), cex=0.8, xlab="scan number of file2", ylab="diff between scan number of file1 and file2", main="Global Smooth Transform")
  o=order(ss2)
  lines(ss2[o],sm(ss2[o])-ss2[o],col='green',lwd=2)
  dev.off()
  
  
  old.scan<-data.f[[i]][,'scan']
  data.fg[[i]][, 'scan']<-sm(g2[,'scan'])
  data.fg[[i]][,'scanFirst']<-sm(g2[,'scanFirst'])
  data.fg[[i]][,'scanLast']<-sm(g2[,'scanLast'])
  data.fg[[i]]<-cbind(old.scan, data.fg[[i]])
  print(paste("FN=",i,"is reshaped"))
}

i=TAG
old.scan<-data.f[[i]][,'scan']
data.fg[[i]]<-cbind(old.scan, data.fg[[i]])
save(data.fg, file="data.fg.Rdata")

mzpool<-NULL
for(i in 1:FN)
  mzpool<-c(mzpool, data.fg[[i]][,'mz'])

mzpoint<-quantile(mzpool, seq(0, 1, by=0.005))

save(mzpool, file="mzpool.Rdata")
rm(mzpool)    


save(mzpoint, file="mzpoint.Rdata")
#########################################################################
#########################################################################
#########################################################################
#########################################################################

#################2.alignbin#################
#divide mz intervals from 85 to 500 into 200 parts
#load("data.fg.Rdata") #number of files
#load("template.Rdata") #output of PETAL_R_cod_1.R

#InforColumn=18
#FN=length(data.fg)
#Revised by nakahara in 05/27/13
#length(data.fg) is # of files


inforcolumn=InforColumn
#take=scan(file="index.R")
for (TAKE in 1:(length(mzpoint)-1)) { #this iteration continues to 423th row

#mzpoint is accumulated value of mz
mz.begin=mzpoint[TAKE] 
mz.end=mzpoint[TAKE+1]
#[mz.begin,mz.end] means one of 0.5% intervals of mz

######################### get data
bin.data<-NULL
bin.infor<-NULL

for(i in 1:FN)
{
  cur<-data.fg[[i]]
  
  #identify the 0.5% interval of mz
  pick<- cur[, "mz"]>=mz.begin & cur[,"mz"]<mz.end
  
  #choose data in the 0.5% interval of mz
  bin.data<-rbind(bin.data, cur[pick,-(1:inforcolumn)])
  bin.infor<-rbind(bin.infor, cbind(i,cur[pick,(1:inforcolumn)]))
  print(paste(TAKE,"th bin","FN=",i,"data is choosen"))
}

bin.data<-as.matrix(bin.data)

#change the column name of "scan"
##Why do we need change these columns name?
bin.infor[, "scanFirst"]<-bin.infor[,"scan"]
bin.infor[,"scanLast"]<-bin.infor[,"scan"]

#################################
mz.cur<-mean(bin.infor[,"mz"])

#mono isotope at 20
shape.cur<-specEle(mz.cur, template.data, template.infor, K=100)

#################################

mz.end.new<-max(mz.end, mz.begin+3)
mz.begin.new<-mz.begin-1

n<-nrow(bin.infor)
#mz.index is a sequence from mz.begin.new to mz.end.new+0.028
mz.index<-seq(mz.begin.new, mz.end.new+0.028, by=6/216)
m<-length(mz.index)

#temp has x rows and y columns,
#where x is 2xmz.index
#and y is # of bins in the 0.5% interval of mz 
temp<-apply(as.matrix(1:n), 1, GenBasis, 
            mz.index=mz.index,bin.infor=bin.infor,
            shape.cur=shape.cur, bin.data=bin.data)

#totaldata is a first half of temp
totaldata<-t(temp[-(1:length(mz.index)),])
#totalbaisis is the latter half of temp
totalbasis<-t(temp[1:length(mz.index),])
rm(temp)

#revise temp to data in the final 0.5% interval of mz
temp<-as.matrix(bin.infor)
numcol <- c(1:5,7:19)
temp[,numcol] <- as.numeric(temp[,numcol])
temp[,6] <- as.character(temp[,6])

totalinfor<-matrix(as.numeric(as.vector(temp)), nrow=nrow(totaldata))
colnames(totalinfor)<-colnames(temp)
totalinfor[is.na(totalinfor)]<-0
#############################################################################
#######################################################

#begin.infor and begin.basis both have x rows and y columns,
#where x is xmz.index
#and y is # of bins in the final 0.5% interval of mz 

begin.infor<-totalinfor
begin.basis<-totalbasis

####### normalize the initial data
#scal.matrix is a matrix for normalization
scal.matrix<-matrix(apply(totaldata, 1, max), nrow=nrow(totaldata), ncol=ncol(totaldata), byrow=F)
totaldata.norm<-totaldata/scal.matrix

######  regression
#begin.infor and begin.basis both have x rows and y columns,
#where x is xmz.index
#and y is # of bins in the 0.5% interval of mz 
CUR.basis<-begin.basis
CUR.infor<-begin.infor

#set parameters for optimization
niter=5
max.step=c(FN+1, rep(5, niter))
track=rep(0, niter+1)
track[1]<-nrow(CUR.basis)
tree.delta<-0.5

####### iteration procedure to produce nice basises
library(lars)
library(elasticnet)

for(i in 1:5) {
  Sys.sleep(1)
  coematrix<-Do.elastic.new(CUR.basis, CUR.infor, totaldata.norm, totalinfor, k=max.step[i], scan.window=50)
  
  ######### combine some redundent basis
  base.count<-apply(abs(coematrix)>0.001, 2, sum)
  colnames(coematrix)=1:ncol(coematrix)
  
  #coe.dis<-dist(normline(t(coematrix[,base.count>0])))
  coe.dis<-consen.dist(t(coematrix[,base.count>0])) ### modified, removed "normline"
  coe.clust<-hclust(coe.dis, method="average")
  coe.ct<-cutree(coe.clust, h=tree.delta)
  
  base.cl<-rep(0, length(base.count))
  base.cl[base.count>0]<-coe.ct
  
  ##########
  
  new.basis<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin.new,mz.end.new, shape.cur)
  CUR.basis<-new.basis$basis
  CUR.infor<-new.basis$info
  track[i+1]<-nrow(CUR.basis)
  
  if(track[i+1]==track[i])
  {break}
}

###############################  back to original data
######## use lasso instead
coematrix.final<-Do.lasso.new(CUR.basis, CUR.infor, totaldata, totalinfor, k=2, scan.window=50)

######## manually clean the matrix, remove those frustrating confounding
coematrix.clean<-clean.coe(coematrix.final, CUR.infor, mz.window=0.3)

base.count<-apply(abs(coematrix.clean)>0.001, 2, sum)
base.cl<-rep(0, length(base.count))
base.cl[base.count>0]<-1:sum(base.count>0)
final.ba<-combine.base.new(base.cl, CUR.basis, CUR.infor, mz.begin,mz.end, shape.cur)
Final.basis<-final.ba$basis
Final.infor<-final.ba$info

coematrix.f<-Do.lasso.new(Final.basis, Final.infor, totaldata, totalinfor, k=2, scan.window=50)
coematrix.clean.f<-clean.coe(coematrix.f, Final.infor, mz.window=0.3)

########################### convert to peptide array
result<-call.basis(coematrix.clean.f, totalinfor, basis.infor=Final.infor)
basis.select<-result$Result
PepA<-result$PeptideArray

sample.ind<-totalinfor[,1]
scan.name<-paste("OldScan", 1:max(sample.ind), sep="")
inten.name<-paste("Intensity", 1:max(sample.ind), sep="")
col.name<-c('scan', 'mz', 'charge', scan.name, inten.name)
colnames(PepA)<-col.name

write(t(PepA), ncolumns=ncol(PepA), file=paste("Peptide", TAKE, sep=""))

print(paste(0.5*TAKE,"% of elasticnet is done"))
}

######################3.Finalization######################
#load("mzpoint.Rdata")
#load("data.fg.Rdata") #number of files
#FN=length(data.fg) #number of files
#source(file="R_cod_PETAL_function.txt")

FN<-length(data.list)
PepArray.result<-NULL
#check.cur2<-1:200

for(i in 1:200)
{
  filename<-paste("./Peptide", i,sep="")
  cur.data<-matrix(scan(file=filename), ncol=2*FN+3, byrow=T)
  #temp<-cur.data[,10]>100 & cur.data[,11]>100
  #check.cur2[i]<-cor(log(cur.data[temp,10]), log(cur.data[temp,11]))
  PepArray.result<-rbind(PepArray.result, cur.data)
  print(paste("peptide",i,"=","(",nrow(PepArray.result),
              ncol(PepArray.result),")"))
}

#change order of PepArray.result by mz
mz.order<-order(PepArray.result[,2])
PepArray.result<-PepArray.result[mz.order,]

cbind(mzpoint, mzpoint)
bins<-cbind(mzpoint-0.05, mzpoint+0.05)

#pick.up row number of peaks inclueded in same mz bins 
#and has same retention time and charge
#"getPair.inbin" is function for detecting the peaks
double.count<-apply(as.matrix(1:nrow(bins)), 1, 
                    getPair.inbin, bins=bins, 
                    result=PepArray.result)

#################################################################
remove<-NULL
n.c<-length(double.count)

if(n.c>0)
{
  for(i in 1:n.c)
  {
    #double.count the peaks has by peptide file.
    cur<-double.count[[i]]   
    temp<-nrow(cur)
    if(temp>0)
    {
      for(j in 1:temp)
      {
        #pick up the peaks
        cur.pair<-cur[j, ]
        #merge the peaks's intensity
        temp1<-apply(PepArray.result[cur.pair,-(1:(FN+3))], 1, sum)
        #pick up the lower intensity's row number
        n.keep<-cur.pair[which.min(temp1)[1]]
        remove<-c(remove, n.keep)
      }
    }
  }
}

#remove the lower intensity's row from PepArray.Final
if(length(remove)>0)
{
  PepArray.Final<-PepArray.result[-remove, ]
} else{ PepArray.Final<-PepArray.result}

#rename columns of PepArray.Final
default <- c("scan","mz","charge")
colscan <- NULL
colint <- NULL

for (i in 1:FN) {
  colscan <- c(colscan, paste(i,"_scan_old",sep=""))
  colint <- c(colint, paste(i,"_intensity",sep=""))
}

colall <- c(default,colscan,colint)
colnames(PepArray.Final)<-colall

write.table(PepArray.Final, file="PepArray.Final.txt")
save(PepArray.Final, file="msInspect.PETAL.PepArray.Rdata")
*1:strsplit(x,","))))) k<-length(dd) ddd<-matrix(0, nrow=k, ncol=216) #revised from 217 to 216 by nakahara 13/05/25 for(i in 1:k) { ddd[i,]<-as.numeric(as.vector(dd[i][[1]][1:216]