Commands used to build LvPtE5C. (Last updated 08/09/2018) #Note the three cycles of fgap_wrapper.sh followed by dedupe.sh. #Each time gaps are filled there is a new possibility to remove a sequence #from one of the haplotypes, and doing so reduces erroneous assembly later on #of the type where both haplotype sequences are incorporated into the genome, #rather than eliminating one of the two. #commands were run on two machines, oboe and bassoon, with different #root directories export OTOPDIR=/home/mathog export BTOPDIR=$BTOPDIR/Lv ##get and install platanus and its trim programs cd $OTOPDIR/src wget -O platanus-1.2.4 http://platanus.bio.titech.ac.jp/?ddownload=145 cd $OTOPDIR/bin wget -O platanus_pe_trim http://platanus.bio.titech.ac.jp/?ddownload=153 wget -O platanus_mp_trim http://platanus.bio.titech.ac.jp/?ddownload=154 ##get and install OPERA-LG #download from sourceforge: # https://sourceforge.net/projects/operasf/files/OPERA-LG%20version%202.0.6/ #install in $OTOPDIR/src #patch one perl script and create a PacBIO specific version. Some of this #may be in newer OPERA-LG versions: cat >/tmp/olg_patches.txt <<'EOD' --- OPERA-long-read.pl 2018-01-24 11:12:15.952938478 -0800 +++ OPERA-pacbio-read.pl 2018-02-08 15:24:04.893806637 -0800 @@ -5,7 +5,19 @@ my $min_contig_size = 300; my $min_opera_contig_size = 500; my $long_read_cluster_threshold = 2; -for($i = 0; $i < 6; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;} +# sd and mean for fake end pairs derived from long reads. +my @stds = (100, 333, 667, 1333, 2000, 2667, 3333, 4667); +my @means = (300, 1000, 2000, 4000, 6000, 8000, 10000, 14000); +my %inter = (); +my $num_it = @stds; #used elsewhere for the number of loop iterations. +for($i = 0; $i < $num_it; $i++){ + $inter{"i${i}"} = [$stds[$i],$means[$i]]; +} +# For some reason original script had i0 with -200 for stds, so change that one +$inter{"i0"} = [-200,$means[0]]; + + +for($i = 0; $i < $num_it; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;} my $short_read_cluster_threshold = 3; #my @cluster_threshold_tab = (2,2,2,2,1,1); #my @cluster_threshold_tab = (2,2,2,2,2,1); @@ -27,6 +39,9 @@ #my $mapper_extention my $graphmapDir = "/mnt/software/stow/graphmap-0.3.0-1d16f07/bin/"; +#for emitting more log information +my $command; + #Used in case of grapmap mapping my @contig_id_to_name = (); @@ -39,7 +54,7 @@ my $short_read_tooldir = ""; my $samtools_dir = ""; my $short_read_maptool = "bwa"; -my $kmer_size = 100; +my $kmer_size = 61; my $flag_help; @@ -59,6 +74,7 @@ --opera: Folder which contains opera binary (default PATH) --illumina-read1: fasta file of illumina read1 --illumina-read2: fasta file of illumina read2 + --upto-config: stop after config is generated, before OPERA-LG runs --help : prints this message "; @@ -82,6 +98,7 @@ "short-read-tooldir=s" => \$short_read_tooldir, "illumina-read1=s" => \$illum_read1, "illumina-read2=s" => \$illum_read2, + "upto-config+" => \$upto_config, "help" => \$flag_help ) or die("Error in command line arguments.\n"); @@ -154,7 +171,7 @@ chdir( $outputDir ); my $str_full_path = "or please enter the full path"; -if ( ! -e $contigFile ) {die "\nError: $contigFile - contig file does not exist $str_full_path\n"}; +if ( ! -e $contigFile ) {die "\nError: --num-of-processors $nproc $contigFile - contig file does not exist $str_full_path\n"}; if ( ! -e $readsFile ) {die "\nError: $readsFile - long read file does not exist $str_full_path\n"}; if ( ! -e $illum_read1 ) {die "\nError: $illum_read1 - illumina read 1 file does not exist $str_full_path\n"}; if ( ! -e $illum_read2 ) {die "\nError: $illum_read2 - illumina read 2 file does not exist $str_full_path\n"}; @@ -172,7 +189,7 @@ $str_path_dir = ""; $str_path_dir .= "--tool-dir $short_read_tooldir" if($short_read_tooldir ne ""); $str_path_dir .= "--samtools-dir $samtools_dir" if($samtools_dir ne ""); - run_exe("perl $operaDir/preprocess_reads.pl --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam"); + run_exe("perl $operaDir/preprocess_reads.pl --num-of-processors $nproc --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam"); } if(! -e "$file_pref.map.sort"){ @@ -272,7 +289,7 @@ } #Filter the long read edges -for (my $i = 0; $i <= 5; $i++){ +for (my $i = 0; $i < $num_it; $i++){ $edge_file = $all_edge_file."_i$i"; $updated_edge_file = $all_edge_file."_no_repeat_i$i"; push(@allEdgeFiles, $updated_edge_file); @@ -296,22 +313,21 @@ &CreateConfigFile( $contigFile, "", @allEdgeFiles ); # run opera -&run_exe( "${operaDir}OPERA-LG config > log" ); +if (defined($upto_config)) { + $time = localtime; + print STDERR "[$time] config file created, exiting without running OPERA-LG\n"; + exit 0; +} +&run_exe( "$operaDir/OPERA-LG config > log" ); #Link to the result file &run_exe("ln -s results/scaffoldSeq.fasta scaffoldSeq.fasta"); sub extract_edge{ my ($all_edge_file) = @_; + $time = localtime; + print STDERR "[$time] in sub extract_edge\n"; - my %inter = ( - "i0", [-200, 300], - "i1", [300, 1000], - "i2", [1000, 2000], - "i3", [2000, 5000], - "i4", [5000, 15000], - "i5", [15000, 40000] - ); my %out_edge = (); foreach $it (keys %inter){ @@ -327,7 +343,9 @@ @line = split(/\t/, $_); $dist = $line[4]; $support = $line[6]; - foreach $it (keys %inter){ +# foreach $it (keys %inter){ + for($i = 0; $i < $num_it; $i++){ + $it = "i${i}"; if($support >= 0 && $inter{$it}->[0] < $dist && $dist < $inter{$it}->[1]){ $OUT = $out_edge{$it}; print $OUT join("\t", @line)."\n"; @@ -499,6 +517,8 @@ } sub checkMapping{ + $time = localtime; + print STDERR "[$time] in sub createOutputFolder\n"; my ($mapFile, $all_edge_file) = @_; my ($rn, $cn, $score, $unused, $ro, $rs, $re, $rl, $co, $cs, $ce, $cl); my $currentScore = 0;my $previousScore = 0; @@ -636,6 +656,8 @@ sub CreateConfigFile{ my( $contigFile, $suffix, @edgeFiles) = @_; + $time = localtime; + print STDERR "[$time] in sub CreateConfigFile\n"; open( CONF, ">config".$suffix ) or die $!; @@ -666,8 +688,6 @@ } $i = 0; - @means = (300, 1000, 2000, 5000, 15000, 40000); - @stds = (30, 100, 200, 500, 1500, 4000); foreach $edgeFileName ( @edgeFiles ){ print CONF "[LIB]\n"; #print CONF "cluster_threshold=$cluster_threshold\n"; @@ -687,6 +707,6 @@ sub run_exe{ my ($exe) = @_; $run = 1; - print STDERR $exe."\n";; + print STDERR "running command: $exe\n"; print STDERR `$exe` if($run); } --- preprocess_reads.pl.dist 2018-01-25 14:09:22.035548937 -0800 +++ preprocess_reads.pl 2018-01-26 16:58:02.056919925 -0800 @@ -15,12 +15,14 @@ --out Name of the output file --map-tool Mapping tool can be either bwa (default) or bowtie --tool-dir Directory that contains binaries to the chosen mapping tool (default PATH) - --samtools-dir Directory that contains samtools binaries (default PATH) + --samtools-dir Directory that contains samtools binaries (default PATH) + --num-of-processors number of processors used for mapping stages --help Help "; my $path = ""; my $samtoolsDir = ""; +my $nproc=1; GetOptions( @@ -31,12 +33,12 @@ "map-tool=s" => \$mapTool, "tool-dir=s" => \$path, "samtools-dir=s" => \$samtoolsDir, + "num-of-processors=i" => \$nproc, "help" => \$flag_help ) or die("Error in command line arguments.\n"); - if ( $flag_help ){ print $help_message; exit 0; @@ -133,6 +135,8 @@ sub createOutputFolder { + $time = localtime; + print STDERR "[$time] in sub createOutputFolder\n"; @dir = split( "/", $outputFile ); $folder = ""; for( $i = 0; $i < @dir - 1; $i++ ) @@ -148,6 +152,8 @@ sub checkReadFormat { + $time = localtime; + print STDERR "[$time] in sub checkReadFormat\n"; if( $readFile1 =~ /\.gz$/ ){ # open gzip file open( READ, "gunzip -c $readFile1 |" ) or die $!; @@ -182,6 +188,7 @@ { # build the index $time = localtime; + print STDERR "[$time] in sub buildIndexUsingBowtie\n"; print "[$time]\t"; print "Building bowtie index...\n"; @contigName = split( "/", $contigFile ); @@ -190,11 +197,13 @@ if( $type eq "cs" ) { $command = "${path}bowtie-build -C $contigFile $folder$contigName[ -1 ]"; + print "constructed command: $command\nExecuting command\n"; `$command` or die "ERROR! Running bowtie-build error.\n"; } else { $command = "${path}bowtie-build $contigFile $folder$contigName[ -1 ]"; + print "constructed command: $command\nExecuting command\n"; `$command` or die "ERROR! Running bowtie-build error.\n"; } } @@ -207,6 +216,7 @@ sub mapWithBowtie { $time = localtime; + print STDERR "[$time] in sub mapWithBowtie\n"; print "[$time]\t"; print "Mapping reads using bowtie...\n"; if( $type eq "cs" ) @@ -217,11 +227,13 @@ { $command = "${path}bowtie -v 3 -a -m 1 -S -t $fasta -p 15 $folder$contigName[ -1 ] - 2>${folder}bowtie.err | sort -n > $folder$outputFile"; } + print "constructed command: $command\n"; } sub buildIndexUsingBwa { $time = localtime; + print STDERR "[$time] in sub buildIndexUsingBwa\n"; print "[$time]\t"; print "Building bwa index...\n"; @contigName = split( "/", $contigFile ); @@ -230,12 +242,13 @@ if( $type eq "cs" ) { $command = "${path}bwa index -c -p $folder$contigName[ -1 ] $contigFile"; + print "constructed command: $command\nExecuting command\n"; `$command`; # or die "ERROR! Running bwa index error.\n"; } else { $command = "${path}bwa index -p $folder$contigName[ -1 ] $contigFile"; - print $command."\n"; + print "constructed command: $command\nExecuting command\n"; `$command`; # or die "ERROR! Running bwa index error.\n"; } } @@ -247,7 +260,9 @@ sub readAndMap { - if( $readFile1 =~ /\.gz$/ ){ + $time = localtime; + print STDERR "[$time] in sub readAndMap\n"; + if( $readFile1 =~ /\.groot 62551 5389 0 14:06 ? 00:0z$/ ){ # open gzip file open( F, "gunzip -c $readFile1 |" ) or die $!; open( S, "gunzip -c $readFile2 |" ) or die $!; @@ -257,6 +272,7 @@ open( S, "$readFile2" ) or die $!; } + print "Executing command with piped input\n"; open( OUTPUT, "|-", $command ) or die $!; $num = 1; while( $first = ) @@ -316,20 +332,24 @@ { # align with bwa $time = localtime; + print STDERR "[$time] in sub findSAWithBwa\n"; print "[$time]\t"; print "Finding the SA coordinates of the reads using BWA aln...\n"; - $command = "${path}bwa aln -t 30 $folder$contigName[ -1 ] - > ${folder}read.sai"; + $command = "${path}bwa aln -t $nproc $folder$contigName[ -1 ] - > ${folder}read.sai"; + print "constructed command: $command\n"; } sub generateAlignmentUsingBwa { $time = localtime; + print STDERR "[$time] in sub generateAlignmentUsingBwa\n"; print "[$time]\t"; print "Generate alignments of reads using bwa sampe...\n"; # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | awk \'{ if( \$3 != \"*\" ) print \$0 }\' > $folder$outputFile"; # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | samtools view -S -h -b -F 0x4 - | samtools sort -no - ${folder}temporarySam | samtools view -h -b - > $folder$outputFile"; - $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile"; - print $command."\n"; + ## $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile"; + $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - > $folder$outputFile"; + print "constructed command: $command\n"; } sub clearBowtie EOD cd $OTOPDIR/src/OPERA-LG_v2.0.6/bin patch -p0 ppe_trim.log 1>&2 & platanus_mp_trim SRR176809_1.fastq SRR176809_2.fastq >pmp_09.log 2>&1 & platanus_mp_trim SRR176810_1.fastq SRR176810_2.fastq >pmp_10.log 2>&1 & platanus_mp_trim SRR176812_1.fastq SRR176812_2.fastq >pmp_12.log 2>&1 & ln -s SRR176812_2.fastq.int_trimmed SRR176812_2.trimmed.fq ln -s SRR176812_1.fastq.int_trimmed SRR176812_1.trimmed.fq ln -s SRR176810_2.fastq.int_trimmed SRR176810_2.trimmed.fq ln -s SRR176810_1.fastq.int_trimmed SRR176810_1.trimmed.fq ln -s SRR176809_2.fastq.int_trimmed SRR176809_2.trimmed.fq ln -s SRR176809_1.fastq.int_trimmed SRR176809_1.trimmed.fq ln -s pe_400_R2.fastq.trimmed pe_400_R2.trimmed.fq ln -s pe_400_R1.fastq.trimmed pe_400_R1.trimmed.fq #platanus assemble, on oboe 01/11/2018 mkdir -p $OTOPDIR/Lv/platanus/run1 cd $OTOPDIR/Lv/platanus/run1 platanus assemble -o LvPtA -f \ ../Lv_ILLUMINA/pe_400_R1.fastq.trimmed \ ../Lv_ILLUMINA/pe_400_R2.fastq.trimmed \ -t 48 -m 256 -u 1.0 #platanus scaffold, on oboe 02/07/2018 # add mate pairs and 454 pairs mkdir -p $OTOPDIR/Lv/platanus/run5 cd $OTOPDIR/Lv/platanus/run5 platanus scaffold -o LvPtE \ -c ../run1/LvPtA_contig.fa \ -b ../run1/LvPtA_contigBubble.fa \ -IP1 ../../Lv_ILLUMINA/pe_400_R1.trimmed.fq \ ../../Lv_ILLUMINA/pe_400_R2.trimmed.fq -n1 100 \ -IP2 ../../Lv_454/pairs.1.fastq ../../Lv_454/pairs.2.fastq \ -a2 2268 -d2 1121 -n2 150 \ -OP3 ../../Lv_ILLUMINA/SRR176809_1.trimmed.fq \ ../../Lv_ILLUMINA/SRR176809_2.trimmed.fq -n3 1200 \ -OP4 ../../Lv_ILLUMINA/SRR176810_1.trimmed.fq \ ../../Lv_ILLUMINA/SRR176810_2.trimmed.fq -n4 1200 \ -OP5 ../../Lv_ILLUMINA/SRR176812_1.trimmed.fq \ ../../Lv_ILLUMINA/SRR176812_2.trimmed.fq -n5 1200 -t 48 -u 1.0 #makes LvPtE_scaffold.fa #platanus gap_close, on oboe 02/08/2018 nice platanus gap_close -o LvPtE -c LvPtDE_scaffold.fa \ -f ../../Lv_454/single.fastq ../../Lv_454/pairs.u.fastq \ -IP1 ../../Lv_ILLUMINA/pe_400_R1.trimmed.fq ../../Lv_ILLUMINA/pe_400_R2.trimmed.fq \ -IP2 ../../Lv_454/pairs.1.fastq ../../Lv_454/pairs.2.fastq \ -OP3 ../../Lv_ILLUMINA/SRR176809_1.trimmed.fq ../../Lv_ILLUMINA/SRR176809_2.trimmed.fq \ -OP4 ../../Lv_ILLUMINA/SRR176810_1.trimmed.fq ../../Lv_ILLUMINA/SRR176810_2.trimmed.fq \ -OP5 ../../Lv_ILLUMINA/SRR176812_1.trimmed.fq ../../Lv_ILLUMINA/SRR176812_2.trimmed.fq \ -t 48 #makes LvPtE_gapClosed.fa #on bassoon 02/15/2018 #make a script fgap_wrapper.sh which runs fgap in parallel and only # on sequences which contain N's (so have a gap to close). It also # retrieves megareads efficiently using the fasta_to_binary and # binary_to_fasta programs. This is a slightly updated # version of the program which was used (different error checking # results should be the same.) cat >$OTOPDIR/bin/fgap_wrapper.sh <<'EOD' #!/bin/bash # 2018/02/26 David Mathog, mathog@caltech.edu. # # This is a wrapper for fgap which runs only the subset of # scaffolds which: # 1. contain N's # 2. had corrected PacBIO reads map to flanking regions for those N's. # It is MUCH faster than letting fgap run on a big genome, which for the 1Gb # one I tried ran for 6 days, most of it with no output, and which was # finally terminated without having emitted a result. # # Method: # Run this script. Providing the parameters: # scaffold_file (fasta) # corrected long reads (fasta) # number of threads # longest sequence allowed (buffer size to allocate) # # fgap is run with -z 1 -g 1. # -z 1 lets it remove just the N's without entering anything else. # -g 1 lets it trim off a bit on the ends flanking the Ns. For some # reason that is needed on sequences like this: # .......AAAAAAAAAAANNNNNNNAGAAAAA.... # Most likely because the blastn extensions span the short NNN and so the # mapped corrected pacbio positions APPEAR to overlap, like: # 100 230 and 200 300 but this is an alignment issue resulting from # doing each side separately, not a real overlap. # ### edit this section if paths change export LONGP=$OTOPDIR/src/MATLAB/MATLAB_Compiler_Runtime/v717 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\ $LONGP/runtime/glnxa64:\ $LONGP/bin/glnxa64:\ $LONGP/sys/os/glnxa64:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64/native_threads:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64/server:\ $LONGP/sys/java/jre/glnxa64/jre/lib/amd64 PFGAP=$OTOPDIR/src/gapFinisher/FGAP/fgap ABATCH=16 # map lines per batch in each thread MINSCORE=50 # also minimum overlap ### test_exec() { THEFILE=`which $1` if [ ! -x $THEFILE ] then echo "fgap_wrapper.sh: fatal error: required binary $THEFILE is not found or executable" exit fi } test_read() { THEFILE=$1 if [ ! -r $THEFILE ] then echo "fgap_wrapper.sh: fatal error: required input $THEFILE is not found or readable" exit fi } do_one_set(){ CHUNK=$1 START=$2 LAST=$3 MAPF=../map_scf_read.txt SCFF=../scf_db CPBF=../cpb_db mkdir chunk_$CHUNK cd chunk_$CHUNK NOW=`date` echo "$NOW thread $CHUNK range $START $LAST" >>all_out.log while [ $START -le $LAST ] do FINAL=$(( $START + $ABATCH - 1 )) if [ $FINAL -ge $LAST ] then FINAL=$LAST fi NOW=`date` echo "$NOW thread $CHUNK processing batch $START $FINAL" >>all_out.log extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[1]' -wl $MAXMAPLEN \ | binary_to_fasta -in $SCFF -sf 1 -of 2 >scf.fasta 2>/dev/null extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[2,]' -dv '\n' -wl $MAXMAPLEN \ | sort -n | uniq \ | binary_to_fasta -in $CPBF -sf 1 -of 2 >reads.fasta 2>/dev/null # # do NOT use -z 1 with -g 1, some sequences will take HOURS to process!!!! # $PFGAP -d scf.fasta -a reads.fasta -R 2500 -I 3500 -t 1 -s 50 -z 1 >>all_out.log 2>&1 >all_out.log if [ -e output_fgap.stats ] then cat output_fgap.stats >> all_out.stats rm output_fgap.stats else echo "FAILURE: no output_fgap.stats was created" >> all_out.stats fi if [ -e output_fgap.final.fasta ] then mv output_fgap.final.fasta scf.fasta # # negative ones that can really be replaced are generally small so set -R low. # Otherwise -g 1 may sometimes remove and replace large swaths of bases # which are not Ns. # $PFGAP -d scf.fasta -a reads.fasta -R 200 -I 500 -t 1 -s 50 -p 0 -g 1 >>all_out.log 2>&1 >all_out.log if [ -e output_fgap.stats ] then cat output_fgap.stats >> all_out.stats rm output_fgap.stats else echo "FAILURE: no output_fgap.stats was created" >> all_out.stats fi if [ -e output_fgap.final.fasta ] then cat output_fgap.final.fasta >>all_out.fasta rm -f output_fgap.final.fasta else # do not log failures here because negative gaps are pretty rare so many will "fail" cat scf.fasta >>all_out.fasta # results from first round are still good, copy them fi rm scf.fasta else echo "$NOW BATCH FAILURE -z 1: Some issue with this set or no gaps could be filled." >> all_out.log fi # # If no gaps are filled output_fgap.final.fasta is not written. # If it is not removed afterwards that scaffold will be entered twice into the final list. # Logic above handles these cases. # START=$(( $FINAL + 1 )) done rm scf.fasta NOW=`date` echo "$NOW thread $CHUNK range completed" >>all_out.log } NOW=`date` echo "$NOW processing begins" SCAFFOLDS=$1 CORRECTED=$2 THREADS=$3 MAXSEQ=$4 test_exec binary_to_fasta test_exec extract test_exec seq test_exec fasta2fgap test_exec fastafrag test_exec blastn test_exec makeblastdb test_exec many_blast_plus_1cpu.sh test_read $SCAFFOLDS test_read $CORRECTED #the matlab runtime will sink a hook into X11 if it can find it. If the starting #session then exits so will the runtime. So hide X11 from the matlab runtime. if [ $DISPLAY ] then unset DISPLAY fi WORKDIR=tmp_fgap mkdir -p $WORKDIR NOW=`date` if [ -e $WORKDIR/datasets.fasta ] then echo "$NOW using EXISTING converted corrected reads datasets.fasta" else # # corrected reads. Differs very slightly from FGAP in that it # retains the original header after a space. blastn will drop that # extra text, so final blastn.out will be the same. # echo "$NOW converting corrected reads to datasets.fasta" fastafrag -length 1000000 -outlen 60 -prefix 'dataset_1_%d ' -upper \ <$CORRECTED >$WORKDIR/datasets.fasta fi NOW=`date` if [ -e $WORKDIR/draft_regions.fasta ] then echo "$NOW using EXISTING scaffolds to draft_regions.fasta" else echo "$NOW converting scaffolds to draft_regions.fasta" fasta2fgap -in $SCAFFOLDS -f 300 -m $MINSCORE -d 1 > $WORKDIR/draft_regions.fasta fi cd $WORKDIR NOW=`date` if [ -e datasets.00.nsq ] then echo "$NOW using EXISTING blast database made from datasets.fasta" else echo "$NOW making blast database out of datasets.fasta" makeblastdb -dbtype nucl -in datasets.fasta -title datasets -out datasets fi NOW=`date` if [ -e blastn.out ] then echo "$NOW using EXISTING blastn.out" else echo "$NOW running blastn for draft_regions vs. datasets to make blastn.out" BLASTPARAMS="-db datasets -task blastn -min_raw_gapped_score $MINSCORE -evalue 1e-07 -perc_identity 70 -word_size 15 -gapopen 1 -gapextend 1 -penalty -3 -reward 1 -max_target_seqs 200 -outfmt \"6 qseqid sseqid score bitscore evalue pident nident mismatch sstrand qstart qseq qend sstart sseq send qlen length qcovhsp\"" # PROG=$1 # INFILE=$2 # OUTFILE=$3 # NCPUS=$4 # BLASTPARAMS=$5 many_blast_plus_1cpu.sh blastn draft_regions.fasta blastn.out $THREADS "$BLASTPARAMS" fi cd .. #back in top level # # next command makes a file with structure: # scaffold read1 .... readN # For an assembly with very large scaffolds and lots of repeats # the default of 50M might not be enough. That should be enough to # hold over 6M reads if they take <=8 characters each. # NOW=`date` if [ -e map_scf_read.txt ] then echo "$NOW using EXISTING making map_scf_read.txt" else echo "$NOW making map_scf_read.txt from blastn.out" extract -in $WORKDIR/blastn.out -mt -dl ' \t_' -fmt '[fw6:jr:2] [7]' \ | sort -k 1,1n -k2,2n \ | uniq \ | extract -merge 6 -wl 50000000 >map_scf_read.txt test_read map_scf_read.txt fi NOW=`date` if [ -e cpb_db.hea ] then echo "$NOW using EXISTING cpb_db fasta_to_binary db from $CORRECTED" else echo "$NOW making fasta_to_binary db from $CORRECTED" fasta_to_binary -in $CORRECTED -out cpb_db -ls -ms 500000 test_read cpb_db.hea #one file from binary database of corrected PacBIO reads fi NOW=`date` if [ -e scf_db.hea ] then echo "$NOW using EXISTING scf_db fasta_to_binary db from $SCAFFOLDS" else echo "$NOW making fasta_to_binary db from $SCAFFOLDS" # # Replace ">header" with ">123 header". # This VASTLY simplifies some of the later steps because there is always # an easy to parse index right at the start of every header and we don't # need to poke around within the header itself. Also in some cases there # may not even be an index in the header which corresponds the the entry's # position in the fata file. # cat $SCAFFOLDS \ | fastafrag -prefix '%d ' \ | fasta_to_binary -in - -out scf_db -ls -ms 500000 test_read scf_db.hea #one file from binary database of scaffolds with gaps fi MAXMAPLEN=`cat map_scf_read.txt | wc -L ` MAXMAPLEN=$(( $MAXMAPLEN + 100)) #a few extra bytes are needed ORIGSCFN=`binary_to_fasta -in scf_db -sf 0 -of 2 2>/dev/null | grep '>' | wc -l` LINES=`cat map_scf_read.txt | wc -l` #lines in the map BLOCK=$(( 1 + $LINES / $THREADS )) #total lines from map processed per thread NOW=`date` echo "$NOW lines $LINES block $BLOCK threads $THREADS origscfN $ORIGSCFN" NOW=`date` if [ -e pre_mod_scaffolds.fasta ] then echo "$NOW using EXISTING pre_mod_scaffolds.fasta mod.log mod.stats" else echo "$NOW generating pre_mod_scaffolds.fasta mod.log mod.stats" I=1 MAXTHREADS=0 TSTART=1 while [ $I -le $THREADS ] do TEND=$(($TSTART + $BLOCK -1)) if [ $TSTART -gt $LINES ] then break else MAXTHREADS=$I fi if [ $TEND -gt $LINES ] then TEND=$LINES fi do_one_set $I $TSTART $TEND & TSTART=$(( $TEND + 1 )) I=$(( $I + 1)) done NOW=`date` echo "$NOW started threads: $MAXTHREADS" wait #assemble the pieces which it made truncate --size 0 pre_mod_scaffolds.fasta truncate --size 0 mod.log truncate --size 0 mod.stats I=1 while [ $I -le $MAXTHREADS ] do NOW=`date` echo "$NOW merging results from thread $I" SUBDIR=chunk_$I cat $SUBDIR/all_out.fasta >> pre_mod_scaffolds.fasta cat $SUBDIR/all_out.log >> mod.log cat $SUBDIR/all_out.stats >> mod.stats rm -rf $SUBDIR I=$(( $I + 1)) done fi NOW=`date` if [ -e scaffold_indices.txt ] then echo "$NOW using EXISTING scaffold_indices.txt" else seq 1 $ORIGSCFN > scaffold_indices.txt fi # # some of the scaffolds put into fgap will NOT have been patched at all. # If they were not they will not be in any of the all_out.fasta files # and so not in pre_mod_scaffolds.fasta. The next section grabs the # scaffold number from the header of that file, and it depends on # all scaffolds having the same format headers! # NOW=`date` if [ -e list_modified.txt ] then echo "$NOW using EXISTING list_modified.txt" else echo "$NOW creating list_modified.txt" extract -in pre_mod_scaffolds.fasta -if '^>' -ifonly -mt -dl '> ' -fmt '[fw10:jr:1]X' -wl $MAXMAPLEN >list_modified.txt extract -in scaffold_indices.txt -fmt '[fw10:jr:1,]' >>list_modified.txt fi NUMMODIFIED=`cat list_modified.txt | grep 'X' | wc -l` # # for speed extract all of the entries in one fell swoop which fgap didn't process. Collapse # the fasta files into single lines and sort on the first field. Then process back into fasta # form. # NOW=`date` if [ -e modified_scaffolds.txt ] && [ -e unmodified_scaffolds.txt ] then echo "$NOW using EXISTING modified_scaffolds.txt" else echo "$NOW creating modified_scaffolds.txt" sort -k 1,1n list_modified.txt | extract -merge 10 >runs_both grep -v X runs_both >unmodified_scaffolds.txt grep X runs_both | tr -d 'X' > modified_scaffolds.txt rm runs_both fi NOW=`date` if [ -e tmp_db.hea ] then echo "$NOW using EXISTING tmp_db" else echo "$NOW making tmp_db" # # Enter two sets of sequences first modified, then unmodified. # ( cat pre_mod_scaffolds.fasta ; \ binary_to_fasta -in scf_db -sel unmodified_scaffolds.txt -sf 1 -of 2 ) \ | fasta_to_binary -out tmp_db -ls -ms 500000 fi # # make the final sequence files. Remove the numeric index # from the headers. # cat modified_scaffolds.txt unmodified_scaffolds.txt \ | extract -in -,scaffold_indices.txt -indl ' ' \ | sort -k 1,1n \ | extract -mt -fmt '[2]' \ | binary_to_fasta -in tmp_db -out - -sf 1 -of 2 \ | extract -out all_scf.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ cat pre_mod_scaffolds.fasta \ | extract -out mod_scaffolds.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ NOW=`date` echo "$NOW cleaning up temporary files" # save these when debugging: change true to false if true then rm cpb_db.* rm scf_db.* rm tmp_db.* rm list_modified.txt fi NOW=`date` echo "$NOW Scaffolds which were processed: $MODSEL" echo "$NOW Scaffolds which were modified: $NUMMODIFIED" echo "$NOW List of processed scaffolds are in modified_scaffolds.txt" echo "$NOW Merged scaffold sequences, modified and unmodified, are in all_scf.fasta" echo "$NOW done" EOD #gap fill with FGAP, on bassoon 02/15/2018 mkdir -p $BTOPDIR/Lv/FGAP/LvPtE #move some data oboe to bassoon, on oboe 02/15/2018 cd $OTOPDIR/Lv/do_masurca_B scp mr.41.15.17.0.029.1.fa \ bassoon:$BTOPDIR/Lv/FGAP/lv_mscB_mr.fa cd $OTOPDIR/Lv/platanus/run5 scp LvPtE_gapClosed.fa bassoon:/tmp/LvPtE_gapClosed.fa #resume work, gap fill producing all_scf.fasta, on bassoon 02/15/2018 cd $BTOPDIR/Lv/FGAP/LvPtE ln -s /tmp/LvPtE_gapClosed.fa LvPtE_gapClosed.fa nice fgap_wrapper.sh LvPtE_gapClosed.fa ../lv_mscB_mr.fa 44 50000000 \ >fgap_wrapperB.log 2>&1 #deduplicate the gap filled sequences, on bassoon 03/02/2018 # tried many different combinations of parameters, this was best. # Because of the highly polymorphic sequences dedupe.sh does not work # very well. Even if the geometry is right the gaps/mismatches often # exceed the limits and the two haplotypes of a sequence will both persist. # Also it fails miserably when the two haplotype pieces overlap like this: # ------------ # ------------ # mkdir $BTOPDIR/Lv/FGAP/other cd $BTOPDIR/Lv/FGAP/other nice $OTOPDIR/src/bbmap/dedupe.sh \ in=../all_scf.fasta \ out=LvPtE_gapClosedFGAPdedupe7.fa \ outd=LvPtE_gapClosedFGAP_dups_removed7.fa \ findoverlap=t minidentity=97 maxedits=12 \ >LvPtE_gapClosedFGAP_dedupe7.log 2>&1 #make bam files for OPERA-LG, on bassoon 03/02/2018 cat >do_others2.sh <<'EOD' #!/bin/bash #edit the next two lines CONTIGS=LvPtE_gapClosedFGAPdedupe7.fa PREFIX=olg_e5_ do_one() { THIS=$1 nice perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/preprocess_reads.pl \ --num-of-processors 44 --map-tool bwa \ --contig $CONTIGS \ --illumina-read1 $BTOPDIR/Lv/Lv_ILLUMINA/${THIS}_1.trimmed.fq \ --illumina-read2 $BTOPDIR/Lv/Lv_ILLUMINA/${THIS}_2.trimmed.fq \ --out ${PREFIX}${THIS}.bam >${PREFIX}${THIS}_bam.log 2>&1 } cd $BTOPDIR/Lv/FGAP/other NOW=`date` echo "$NOW starting 454 mapping" nice perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/preprocess_reads.pl \ --num-of-processors 44 --map-tool bwa \ --contig $CONTIGS \ --illumina-read1 $BTOPDIR/Lv/Lv_454/pairs.1.fastq \ --illumina-read2 $BTOPDIR/Lv/Lv_454/pairs.2.fastq \ --out ${PREFIX}454.bam >${PREFIX}454_bam.log 2>&1 do_one SRR176809 do_one SRR176810 do_one SRR176812 NOW=`date` echo "$NOW done" EOD chmod 755 do_others2.sh ./do_others2.sh >do_others2.log 2>&1 #run OPERA, output to results1 subdirectory, on oboe 03/02/2018 #this uses bam files made above plus it makes olg_e5.bam using #the 400bp pe set. #pacbio_nonredundant.fa was made by MaSuRCA, it follows the CCS step #but is before the megareads are constructed. cd $OTOPDIR/Lv/OPERA-LG/E_RUN5 scp bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAPdedupe7.fa . scp bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAP_dups_removed7.fa . scp bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAP_dedupe7.log . scp bassoon:$BTOPDIR/Lv/FGAP/LvPtE_gapClosedFGAP.fa . cd .. perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/OPERA-pacbio-read.pl \ --upto-config \ --num-of-processors 48 \ --kmer 31 \ --opera $OTOPDIR/src/OPERA-LG_v2.0.6/bin \ --contig-file $OTOPDIR/Lv/OPERA-LG/E_RUN5/LvPtE_gapClosedFGAPdedupe7.fa \ --illumina-read1 $OTOPDIR/Lv/Lv_ILLUMINA/pe_400_R1.trimmed.fq \ --illumina-read2 $OTOPDIR/Lv/Lv_ILLUMINA/pe_400_R2.trimmed.fq \ --long-read-file $OTOPDIR/Lv/do_masurca_B/pacbio_nonredundant.fa \ --output-prefix olg_e5 \ --output-directory E_RUN5 \ >e_run5.log 2>&1 & cd E_RUN5 # #A config file was created by the preceding command, but it does not know #about the 454 and Illumina MP data. Make e5_config from it. For clarity, # here is the entire e5_config file: cat >e5_config <<'EOD' # # Essential Parameters # # Output folder for final results output_folder=results1 # Contig file contig_file=$OTOPDIR/Lv/OPERA-LG/E_RUN5/LvPtE_gapClosedFGAPdedupe.fa samtools_dir= kmer=31 # Mapped read locations [LIB] map_file=olg_e5.bam cluster_threshold=3 [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i0 lib_mean=300 lib_std=100 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i1 lib_mean=1000 lib_std=333 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i2 lib_mean=2000 lib_std=667 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i3 lib_mean=4000 lib_std=1333 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i4 lib_mean=6000 lib_std=2000 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i5 lib_mean=8000 lib_std=2667 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i6 lib_mean=10000 lib_std=3333 map_type=opera [LIB] cluster_threshold=2 map_file=pairedEdges_no_repeat_i7 lib_mean=14000 lib_std=4667 map_type=opera [LIB] map_file=olg_e5_454.bam cluster_threshold=3 lib_mean=2300 lib_std=1200 [LIB] map_file=olg_e5_SRR176809.bam cluster_threshold=3 lib_mean=2300 lib_std=1200 [LIB] map_file=olg_e5_SRR176810.bam cluster_threshold=3 lib_mean=2300 lib_std=1200 [LIB] map_file=olg_e5_SRR176812.bam cluster_threshold=3 lib_mean=2300 lib_std=1200 EOD $OTOPDIR/src/OPERA-LG_v2.0.6/bin/OPERA-LG \ e5_config >olg_e5.log 2>&1 #gap fill what remains, it makes all_scf.fasta, on oboe 03/03/2018 #Megareads from LvMSCB assembly are used. mkdir -p $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP cd $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP ln -s ../scaffoldSeq.fasta LvPtE_gapClosedFGAPdedupeOLG.fa ln -s $OTOPDIR/Lv/do_masurca_B/mr.41.15.17.0.029.1.fa lv_mscB_mr.fa nice fgap_wrapper.sh LvPtE_gapClosedFGAPdedupeOLG.fa \ lv_mscB_mr.fa 44 50000000 \ >fgap_wrapper.log 2>&1 & #on oboe 03/04/2018 # dedupe, again. # makes LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa cd $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP mv all_scf.fasta LvPtE_gapClosedFGAPdedupeOLGFGAP.fa $OTOPDIR/src/bbmap/dedupe.sh \ in=LvPtE_gapClosedFGAPdedupeOLGFGAP.fa \ out=LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa \ outd=LvPtE_gapClosedFGAPdedupeOLGFGAP_dups_removed.fa \ findoverlap=t minidentity=97 maxedits=12 \ >LvPtE_gapClosedFGAPdedupeOLGFGAP_dedupe.log 2>&1 ln -s LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa LvPtE5.fasta #on bassoon 03/07/2018 #use P_RNA_scaffolder to merge some LvPtE5.fasta scaffolds #makes results/P_RNA_scaffold.fasta mkdir -p $BTOPDIR/Lv/P_RNA_scaffolder cd $BTOPDIR/Lv/P_RNA_scaffolder scp oboe:$OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP/LvPtE5.fasta . cp ../Lv_TRINITY/get_sra.sh . #For now just get one of these: SRR1661111 echo SRR1661111 >SRR.desc nice ./get_sra.sh fastq-dump --defline-seq '@$sn[_$rn]/$ri' --split-3 SRR1661111.sra rm SRR1661111.sra $OTOPDIR/src/hisat2-2.1.0/hisat2-build LvPtE5.fasta Lv_hisat $OTOPDIR/src/hisat2-2.1.0/hisat2 -x Lv_hisat -1 SRR1661111_1.fastq \ -2 SRR1661111_2.fastq \ -k 3 -p 40 --pen-noncansplice 1000000 -S lv_input.sam \ > hisat_pass2.log 2>&1 mkdir results mv lv_input.sam Lv_input.sam $OTOPDIR/src/P_RNA_scaffolder/P_RNA_scaffolder.sh \ -d $OTOPDIR/src/P_RNA_scaffolder \ -i Lv_input.sam \ -j LvPtE5.fasta \ -F SRR1661111_1.fastq -R SRR1661111_2.fastq \ -o $BTOPDIR/Lv/P_RNA_scaffolder/results \ -s yes -t 40 >p_rna.log 2>&1 mv results Lv_results #on bassoon 03/07/2018 #another round of gap filling with FGAP, makes all_scf.fasta mkdir $BTOPDIR/FGAP/LvPtE5B cd $BTOPDIR/FGAP/LvPtE5B ln -s $BTOPDIR/Lv/P_RNA_scaffolder/Lv_results/P_RNA_scaffold.fasta LvPtE5B.fa fgap_wrapper.sh \ LvPtE5B.fa \ ../lv_mscB_mr.fa 40 50000000 \ >fgap_wrapper.log 2>&1 & #mathog on bassoon 03/09/2018 #makes final LvPtE5C.fa cd $BTOPDIR/Lv/FGAP/LvPtE5B $OTOPDIR/src/bbmap/dedupe.sh \ in=all_scf.fasta \ out=LvPtE5C.fa \ outd=LvPtE5C_dups_removed.fa \ findoverlap=t minidentity=97 maxedits=12 \ >LvPtE5C_dedupe.log 2>&1 &