當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Salmon构建索引的时间效率和计算效率明显高于STAR

發布時間：2025/3/15 编程问答 18 豆豆

生活随笔收集整理的這篇文章主要介紹了 Salmon构建索引的时间效率和计算效率明显高于STAR 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

前面我們評估了不同大小基因組基于STAR構建索引所需的計算資源和時間資源、不同大小數據集基于STAR進行比對所需的計算資源和時間資源和STAR比對速度與分配線程的關系。

將人類基因組按染色體拆分模擬不同大小基因組構建索引的計算資源需求

采用染色體累加的方式，不斷模擬不同大小的基因組對計算資源的需求。

gffread GRCh38.gtf -g GRCh38.fa -w GRCh38.transcript.fa.tmp# gffread生成的fasta文件同時包含基因名字和轉錄本名字 grep '>' GRCh38.transcript.fa.tmp | head# 去掉空格后面的字符串，保證cDNA文件中fasta序列的名字簡潔，不然后續會出錯 cut -f 1 -d ' ' GRCh38.transcript.fa.tmp >GRCh38.transcript.fa/bin/rm -f GRCh38_tmp.seqid # i=2 for i in `seq 1 22`; doecho "chr$i" >>GRCh38_tmp.seqidseqkit grep -f GRCh38_tmp.seqid GRCh38.fa -o GRCh38_tmp.fa# 一定得刪掉，不然提取轉錄本信息會少/bin/rm -f GRCh38_tmp.fa.faiawk 'BEGIN{OFS=FS="\t"}ARGIND==1{id1[$1]=1}ARGIND==2{if(id1[$1]==1) print $0}' GRCh38_tmp.seqid GRCh38.gtf >GRCh38_tmp.gtfgffread GRCh38_tmp.gtf -g GRCh38_tmp.fa -w GRCh38.transcript.fa.tmpgrep -c '>' GRCh38.transcript.fa.tmp# 去掉空格后面的字符串，保證cDNA文件中fasta序列的名字簡潔，不然后續會出錯cut -f 1 -d ' ' GRCh38.transcript.fa.tmp >GRCh38.transcript_tmp.fagrep '^>' GRCh38_tmp.fa | cut -d ' ' -f 1 | sed 's/^>//g' >GRCh38.decoys.txt# 合并cDNA和基因組序列一起# 注意：cDNA在前，基因組在后cat GRCh38.transcript_tmp.fa GRCh38_tmp.fa >GRCh38_trans_genome.fa# 構建索引（更慢，結果會更準） # --no-version-check/bin/rm -rf GRCh38.salmon_sa_index_tmp/usr/bin/time -o Salmon_human_genome_${i}.log -v salmon index -t GRCh38_trans_genome.fa -d GRCh38.decoys.txt -i GRCh38.salmon_sa_index_tmp -p 2du -s GRCh38_trans_genome.fa | awk 'BEGIN{OFS="\t"}{print "Genome_size: "$1/10^6}' >>Salmon_human_genome_${i}.logdu -s GRCh38.transcript_tmp.fa | awk 'BEGIN{OFS="\t"}{print "Transcript_size: "$1/10^6}' >>Salmon_human_genome_${i}.logdu -s GRCh38.salmon_sa_index_tmp | awk 'BEGIN{OFS="\t"}{print "Output_size: "$1/10^6}' >>Salmon_human_genome_${i}.loggrep -c '>' GRCh38.transcript_tmp.fa | awk 'BEGIN{OFS="\t"}{print "Transcript_count: "$1}' >>Salmon_human_genome_${i}.log done# 獲取所有基因組序列的名字存儲于decoy中 grep '^>' GRCh38.fa | cut -d ' ' -f 1 | sed 's/^>//g' >GRCh38.decoys.txt# 合并cDNA和基因組序列一起 # 注意：cDNA在前，基因組在后cat GRCh38.transcript.fa GRCh38.fa >GRCh38_trans_genome.fa# 構建索引（更慢，結果會更準） # --no-version-check /bin/rm -rf GRCh38.salmon_sa_index /usr/bin/time -o Salmon_human_genome.log -v salmon index -t GRCh38_trans_genome.fa -d GRCh38.decoys.txt -i GRCh38.salmon_sa_index -p 2 du -s GRCh38_trans_genome.fa | awk 'BEGIN{OFS="\t"}{print "Genome_size: "$1/10^6}' >>Salmon_human_genome.log du -s GRCh38.transcript.fa | awk 'BEGIN{OFS="\t"}{print "Transcript_size: "$1/10^6}' >>Salmon_human_genome.log du -s GRCh38.salmon_sa_index | awk 'BEGIN{OFS="\t"}{print "Output_size: "$1/10^6}' >>Salmon_human_genome.log grep -c '>' GRCh38.transcript.fa | awk 'BEGIN{OFS="\t"}{print "Transcript_count: "$1}' >>Salmon_human_genome.log

整合數據生成最終測試結果數據集

首先寫一個awk腳本，整理并轉換 CPU 使用率、程序耗時、最大物理內存需求。這個腳本存儲為timeIntegrate3.awk。

#!/usr/bin/awk -ffunction to_minutes(time_str) {a=split(time_str,array1,":"); minutes=0; count=1; for(i=a;i>=1;--i) {minutes+=array1[count]*60^(i-2);count+=1;}return minutes;} BEGIN{OFS="\t"; FS=": "; }ARGIND==1{if(FNR==1) header=$0; else datasize=$0;}ARGIND==2{if(FNR==1 && outputHeader==1) print "Time_cost\tMemory_cost\tnCPU\tOutput_size\tTranscript_size\tTranscript_count", header;if($1~/Elapsed/) {time_cost=to_minutes($2);} else if($1~/Maximum resident set size/){memory_cost=$2/10^6;} else if($1~/CPU/) {cpu=($2+0)/100} else if($1~/Output_size/){Output_size=$2} else if($1~/Transcript_size/){Transcript_size=$2} else if($1~/Transcript_count/){Transcript_count=$2} }END{print time_cost, memory_cost, cpu, Output_size, Transcript_size, Transcript_count, datasize}

用timeIntegrate3.awk整合評估結果

/bin/rm -f GRCh38_salmon_genome_build.summary for i in `seq 1 22`; doawk -v i=${i} 'BEGIN{for(j=1;j<=i;j++) scaffold["chr"j]=1;}\{if(scaffold[$1]==1) genomeSize+=$2}END{print "Genome_size";print genomeSize/10^9}' star_GRCh38/chrNameLength.txt | \awk -v outputHeader=$i -f ./timeIntegrate3.awk - Salmon_human_genome_${i}.log >>GRCh38_salmon_genome_build.summary doneawk '{genomeSize+=$2}END{print "Genome_size"; print genomeSize/10^9}' star_GRCh38/chrNameLength.txt \ | awk -f ./timeIntegrate3.awk - Salmon_human_genome.log >>GRCh38_salmon_genome_build.summary

最終獲得結果文件如下：

Time_cost Memory_cost nCPU Output_size Transcript_size Transcript_count Genome_size 4.39933 1.39078 2.27 1.27743 0.036424 20716 0.248956 9.0455 2.82505 2.28 2.59288 0.067352 38135 0.49115 12.481 4.01651 2.28 3.66561 0.091868 52968 0.689446 15.8388 5.15384 2.29 4.69 0.109064 62709 0.87966 20.1408 6.3714 2.32 5.78156 0.127944 74002 1.0612 22.9222 8.55484 2.3 6.72696 0.147956 85002 1.232 25.6965 9.34814 2.32 7.61427 0.166756 96223 1.39135 28.2602 10.0071 2.31 8.41734 0.181776 105970 1.53649 30.9775 13.3335 2.32 9.15804 0.196396 113998 1.67488 33.9318 14.3744 2.32 10.096 0.211736 122370 1.80868 38.1807 12.1258 2.33 10.8738 0.23446 136491 1.94377 40.3075 16.167 2.34 11.6486 0.255484 149677 2.07704 42.642 16.8637 2.34 12.2845 0.26292 154002 2.19141 47.5528 16.7611 2.35 12.8888 0.276596 162687 2.29845 47.7633 17.2338 2.36 13.4525 0.291424 171376 2.40044 51.327 17.7096 2.35 13.9687 0.308412 182566 2.49078 54.8048 18.2542 2.39 14.4603 0.329668 196504 2.57404 55.135 18.6732 2.35 14.9188 0.337088 201042 2.65441 56.1127 19.1378 2.37 15.2844 0.357176 214955 2.71303 57.8897 19.4656 2.35 15.6548 0.366436 220525 2.77747 58.7447 19.7024 2.38 15.9044 0.371412 223564 2.82418 59.8983 19.9482 2.36 16.185 0.379824 228615 2.875 61.1667 21.1396 2.37 17.4117 0.394968 236920 3.09975

構建索引的時間隨數據量的變化

Salmon構建索引的時間隨基因組大小/染色體大小/染色體數目增加而增加，基本成線性關系

同樣基因組大小，給定相同線程數時，Salmon速度快于STAR。

數據量更大時，salmon的速度優勢更明顯。

library(ImageGP) library(patchwork)p1 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Genome_size",yvariable = "Time_cost", smooth_method = "auto",x_label ="Genome size (Gb)", y_label = "Running time (minutes)") +scale_x_continuous(breaks=seq(0,4, by=0.5)) +scale_y_continuous(breaks=seq(1,62,by=2)) p2 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_size",yvariable = "Time_cost", smooth_method = "auto",x_label ="Transcript size (Gb)", y_label = "Running time (minutes)") +scale_x_continuous(breaks=seq(0,0.4, by=0.05)) +scale_y_continuous(breaks=seq(1,62,by=2)) p3 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_count",yvariable = "Time_cost", smooth_method = "auto",x_label ="Transcript count", y_label = "Running time (minutes)") +scale_x_continuous(breaks=seq(20000,240000, length=5)) +scale_y_continuous(breaks=seq(1,62,by=2)) p1 + p2 + p3salmon_genome <- sp_readTable("GRCh38_salmon_genome_build.summary") star_genome <- sp_readTable("GRCh38_star_genome_build.summary")salmon_genome_merge <- salmon_genome[,c("Time_cost", "Memory_cost", "nCPU", "Output_size", "Genome_size"), drop=F] salmon_genome_merge$Tool <- "Salmon" colnames(star_genome) <- c("Time_cost", "Memory_cost", "nCPU", "Genome_size", "Output_size" ) star_genome <- star_genome[,c("Time_cost", "Memory_cost", "nCPU", "Genome_size", "Output_size" )] star_genome$Tool <- "STAR"genome_build_merge <- rbind(salmon_genome_merge, star_genome) sp_scatterplot(genome_build_merge, melted = T, xvariable = "Genome_size",yvariable = "Time_cost", smooth_method = "auto",color_variable = "Tool",x_label ="Genome size (Gb)", y_label = "Running time (minutes)") +scale_x_continuous(breaks=seq(0,4, by=0.5)) +scale_y_continuous(breaks=seq(1,100,by=4))

構建索引的所需內存隨數據量的變化

Salmon構建索引的內存需求隨基因組大小/染色體大小/染色體數目增加而增加，基本成線性關系

Salmon對內存的需求明顯小于STAR的需求。

對人類3G大小的基因組，Salmon的內存需求只占STAR的一半

如果用Salmon構建索引時不考慮基因組信息所需內存更少

p1 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Genome_size",yvariable = "Memory_cost", smooth_method = "auto",x_label ="Genome size (Gb)", y_label = "Maximum physical memory required (Gb)") +scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +scale_y_continuous(breaks=seq(1,22,length=22)) p2 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_size",yvariable = "Memory_cost", smooth_method = "auto",x_label ="Transcript size (Gb)", y_label = "Maximum physical memory required (Gb)") +scale_x_continuous(breaks=seq(0,0.4, by=0.05)) +scale_y_continuous(breaks=seq(1,22,length=22)) p3 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_count",yvariable = "Memory_cost", smooth_method = "auto",x_label ="Transcript count", y_label = "Maximum physical memory required (Gb)") +scale_x_continuous(breaks=seq(20000,240000, length=5)) +scale_y_continuous(breaks=seq(1,22,length=22)) p1 + p2 + p3sp_scatterplot(genome_build_merge, melted = T, xvariable = "Genome_size",yvariable = "Memory_cost", smooth_method = "auto",color_variable = "Tool",x_label ="Genome size (Gb)", y_label = "Maximum physical memory required (Gb)") +scale_x_continuous(breaks=seq(0,4, by=0.5)) +scale_y_continuous(breaks=seq(1,40,by=2),limits=c(0,40))

構建索引時對 CPU 的利用率

Salmon的CPU利用率跟數據大小關系不大，且并行效率很高。

Salmon的CPU利用率更穩定，且明顯高于STAR

p1 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Genome_size",yvariable = "nCPU", smooth_method = "auto",x_label ="Genome size (Gb)", y_label = "Number of CPUs used") +scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +scale_y_continuous(breaks=seq(1,4.5,by=0.5), limits=c(0,4)) p2 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_size",yvariable = "nCPU", smooth_method = "auto",x_label ="Transcript size (Gb)", y_label = "Number of CPUs used") +scale_x_continuous(breaks=seq(0,0.4, by=0.05)) +scale_y_continuous(breaks=seq(1,4.5,by=0.5), limits=c(0,4)) p3 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_count",yvariable = "nCPU", smooth_method = "auto",x_label ="Transcript count", y_label = "Number of CPUs used)") +scale_x_continuous(breaks=seq(20000,240000, length=5)) +scale_y_continuous(breaks=seq(1,4.5,by=0.5), limits=c(0,4)) p1 + p2 + p3sp_scatterplot(genome_build_merge, melted = T, xvariable = "Genome_size",yvariable = "nCPU", smooth_method = "auto",color_variable = "Tool",x_label ="Genome size (Gb)", y_label = "Number of CPUs used)") +scale_x_continuous(breaks=seq(0,4, by=0.5)) +scale_y_continuous(breaks=seq(0,3,by=0.5),limits=c(0,3))

生成的索引大小根基因組大小的關系（磁盤需求）

Salmon生成的索引大小跟基因組大小正相關

Salmon構建的索引占用磁盤空間更小

基因組增大時，Salmon所需磁盤空間增速小于STAR

p1 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Genome_size",yvariable = "Output_size", smooth_method = "auto",x_label ="Genome size (Gb)", y_label = "Disk space usages (Gb)") +scale_x_continuous(breaks=seq(0.5,5, by=0.5)) +scale_y_continuous(breaks=seq(1,18,by=1)) p2 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_size",yvariable = "Output_size", smooth_method = "auto",x_label ="Transcript size (Gb)", y_label = "Disk space usages (Gb)") +scale_x_continuous(breaks=seq(0,0.4, by=0.05)) +scale_y_continuous(breaks=seq(1,18,by=1)) p3 <- sp_scatterplot("GRCh38_salmon_genome_build.summary", melted = T, xvariable = "Transcript_count",yvariable = "Output_size", smooth_method = "auto",x_label ="Transcript count", y_label = "Disk space usages (Gb)") +scale_x_continuous(breaks=seq(20000,240000, length=5)) +scale_y_continuous(breaks=seq(1,18,by=1)) p1 + p2 + p3sp_scatterplot(genome_build_merge, melted = T, xvariable = "Genome_size",yvariable = "Output_size", smooth_method = "auto",color_variable = "Tool",x_label ="Genome size (Gb)", y_label = "Disk space usages (Gb)") +scale_x_continuous(breaks=seq(0,4, by=0.5)) +scale_y_continuous(breaks=seq(0,30,by=2),limits=c(0,30))

往期精品(點擊圖片直達文字對應教程)

機器學習

后臺回復“生信寶典福利第一波”或點擊閱讀原文獲取教程合集

總結

以上是生活随笔為你收集整理的Salmon构建索引的时间效率和计算效率明显高于STAR的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： ROS在编译生成自定义消息时报错Modu
下一篇：吴恩达机器学习笔记一