Changes between Version 57 and Version 58 of SnpCallingPipeline


Ignore:
Timestamp:
Dec 9, 2010 9:44:13 AM (13 years ago)
Author:
Leon Mei
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • SnpCallingPipeline

    v57 v58  
    1515digraph g {
    1616
    17   size="10,10" node [shape=box,style=filled,color=white] "dbsnp" "reference.fasta" "realign.intervals" "indelcalls.vcf" "chr[1-24] .fasta" "flowcell_lane.1.fq.gz" "flowcell_lane.2.fq.gz" "flowcell_lane.aligned.bam" "flowcell_lane2.aligned.bam" "flowcell_lane3.aligned.bam" "sample.aligned.bam" "sample QC reports" "sample_chr[1-24] .vcf"
     17  size="10,10" node [shape=box,style=filled,color=white] "dbsnp" "reference.fasta" "realign.intervals" "indelcalls.vcf" "chr[1-24]  .fasta" "flowcell_lane.1.fq.gz" "flowcell_lane.2.fq.gz" "flowcell_lane.aligned.bam" "flowcell_lane2.aligned.bam" "flowcell_lane3.aligned.bam" "sample.aligned.bam" "sample QC reports" "sample_chr[1-24] .vcf"
    1818
    1919  node [shape=ellipse,color=yellow]
     
    2222    style=filled; color=lightgrey;
    2323
    24   "reference.fasta" -> RealignerTargetCreator  -> "realign.intervals" "indelcalls.vcf"-> RealignerTargetCreator   "reference.fasta"->Split->"chr[1-24] .fasta"  dbsnp -> RealignerTargetCreator   label = "Per genome (1)";
     24  "reference.fasta" -> RealignerTargetCreator   -> "realign.intervals" "indelcalls.vcf"-> RealignerTargetCreator    "reference.fasta"->Split->"chr[1-24]  .fasta"  dbsnp -> RealignerTargetCreator    label = "Per genome (1)";
    2525
    2626}
    2727
    2828  subgraph cluster_1 {
    29     style=filled; color=lightgrey; "flowcell_lane.1.fq.gz" -> align1 -> alignPE "chr[1-24] .fasta" -> align1 "chr[1-24] .fasta" -> align2 "chr[1-24] .fasta" -> alignPE "flowcell_lane.2.fq.gz" -> align2 -> alignPE -> MarkDuplicates  -> "IndelRealigner  & \n FixMateInformation  (knownsOnly)" ->"Quality Recalibration"->"flowcell_lane.aligned.bam" "realign.intervals" -> "IndelRealigner  & \n FixMateInformation  (knownsOnly)"    label = "Per Lane (750*3=2250) ";
     29    style=filled; color=lightgrey; "flowcell_lane.1.fq.gz" -> align1 -> alignPE "chr[1-24]  .fasta" -> align1 "chr[1-24]  .fasta" -> align2 "chr[1-24]  .fasta" -> alignPE "flowcell_lane.2.fq.gz" -> align2 -> alignPE -> MarkDuplicates   -> "IndelRealigner   & \n FixMateInformation   (knownsOnly)" ->"Quality Recalibration"->"flowcell_lane.aligned.bam" "realign.intervals" -> "IndelRealigner   & \n FixMateInformation   (knownsOnly)"    label = "Per Lane (750*3=2250) ";
    3030  }
    3131
    3232  subgraph cluster_2 {
    33     style=filled; color=lightgrey; "flowcell_lane.aligned.bam" -> Merge -> "sample.aligned.bam" -> "IndelRealigner  & FixMateInformation " "flowcell_lane2.aligned.bam" -> Merge "flowcell_lane3.aligned.bam" -> Merge "IndelRealigner  & FixMateInformation " -> IndelGenotyperV2 -> FilterSingleCalls  -> UnifiedGenotyper  -> Filtration -> VariantEval  -> "sample QC reports"
     33    style=filled; color=lightgrey; "flowcell_lane.aligned.bam" -> Merge -> "sample.aligned.bam" -> "IndelRealigner   & FixMateInformation  " "flowcell_lane2.aligned.bam" -> Merge "flowcell_lane3.aligned.bam" -> Merge "IndelRealigner   & FixMateInformation  " -> IndelGenotyperV2 -> FilterSingleCalls   -> UnifiedGenotyper   -> Filtration -> VariantEval   -> "sample QC reports"
    3434
    3535Filtration -> "sample_chr[1-24].vcf"
     
    4242    style=filled; color=lightgrey;
    4343
    44   "sample.aligned.bam" -> "UnifiedGenotype  (without realign)"->"QC against arrays and BGI"
     44  "sample.aligned.bam" -> "UnifiedGenotype   (without realign)"->"QC against arrays and BGI"
    4545
    4646  label = "QC per sample";
     
    8686==  ==
    8787== Optimization? ==
    88 {{{
    89 {| {{table}}
    90 | align="center" style="background:#f0f0f0;"|'''Step'''
    91 | align="center" style="background:#f0f0f0;"|'''Cores'''
    92 | align="center" style="background:#f0f0f0;"|'''Memory (gb)'''
    93 | align="center" style="background:#f0f0f0;"|'''Time (hh.mm)'''
    94 |-
    95 | BWA alignment||1||± 6||10.05
    96 |-
    97 | BWA spe||1||||3.35
    98 |-
    99 | Sam-Bam||1||||12.3
    100 |-
    101 | Sam sort||1||||5.05
    102 |-
    103 | Mark Duplicates||1||4||1.55
    104 |-
    105 | Realignment (knowns only)||1||8 (*can be lowered)||5.2
    106 |-
    107 | Fix mates||1||6 (*)||3.05
    108 |-
    109 | Covariates bef.||1||2||12.35
    110 |-
    111 | Recalibrate||1||4||7.3
    112 |-
    113 | Sam sort||1||||4.5
    114 |-
    115 | Covariates aft.||1||2||11.2
    116 |-
    117 | Analyze Covar.||1||4||< 00.01
    118 |-
    119 | Total||||||± 90 (< 4 days)
    120 |-
    121 |
    122 |}
    123 }}}
    124 === Disk ===
     88==== Current ====
     89Step    Cores    Memory (gb)    Time (hh.mm)[[BR]]BWA alignment    1    ± 6    10.05[[BR]]BWA spe    1        3.35[[BR]]Sam-Bam    1        12.3[[BR]]Sam sort    1        5.05[[BR]]Mark Duplicates    1    4    1.55[[BR]]Realignment (knowns only)    1    8 (*can be lowered)    5.2[[BR]]Fix mates    1    6 (*)    3.05[[BR]]Covariates bef.    1    2    12.35[[BR]]Recalibrate    1    4    7.3[[BR]]Sam sort    1        4.5[[BR]]Covariates aft.    1    2    11.2[[BR]]Analyze Covar.    1    4    < 00.01
     90
     91==== Disk ====
    12592 * Option 1: If it is possible to have a node guarantee a certain amount of disk space (/tmp), we should use the entire cluster. Before starting a pipeline run, we can just ask the node to reserve that amount of disk space.
    12693 * Option 2: If we can set aside a dedicated part of the cluster, we can use our own scheduler to share the nodes/disks. E.g., depending on the disk space usage pattern and how we can remove the data, we can decide which jobs run on which node and when.
    12794
    128 === Memory/CPU time ===
    129  * Can multiple samples use the same reference genome in memory during the BWA alignment. I.e. 1 sample->6GB, 3 samples->6GB. 
     95==== Memory/CPU time ====
     96 * Can multiple samples use the same reference genome in memory during the BWA alignment? I.e. 1 sample->6GB, 3 samples->6GB.
    13097   * NO?
    131  * Can we parallelize the Markduplicate? 
     98 * Can we parallelize the Markduplicate?
    13299   * YES!
    133100   * !MarkDuplicates finds sequence pairs that map to the same position, marking or removing the duplicates so you can work with unique pairs in downstream analyses. If you want them removed, use the REMOVE_DUPLICATES=true flag when running the program.
    134  * Can we parallelize covariate before/after, recalibration? 
     101 * Can we parallelize covariate before/after, recalibration?
    135102   * Don't know
    136103