Commands used to build LvPtE5C.
(Last updated 08/09/2018)

#Note the three cycles of fgap_wrapper.sh followed by dedupe.sh. 
#Each time gaps are filled there is a new possibility to remove a sequence 
#from one of the haplotypes, and doing so reduces erroneous assembly later on 
#of the type where both haplotype sequences are incorporated into the genome,
#rather than eliminating one of the two.

#commands were run on two machines, oboe and bassoon, with different
#root directories
export OTOPDIR=/home/mathog
export BTOPDIR=$BTOPDIR/Lv

##get and install platanus and its trim programs
cd $OTOPDIR/src
wget -O platanus-1.2.4 http://platanus.bio.titech.ac.jp/?ddownload=145
cd $OTOPDIR/bin
wget -O platanus_pe_trim http://platanus.bio.titech.ac.jp/?ddownload=153
wget -O platanus_mp_trim http://platanus.bio.titech.ac.jp/?ddownload=154

##get and install OPERA-LG
#download from sourceforge:
#   https://sourceforge.net/projects/operasf/files/OPERA-LG%20version%202.0.6/
#install in $OTOPDIR/src
#patch one perl script and create a PacBIO specific version.  Some of this
#may be in newer OPERA-LG versions:
cat >/tmp/olg_patches.txt <<'EOD'
--- OPERA-long-read.pl	2018-01-24 11:12:15.952938478 -0800
+++ OPERA-pacbio-read.pl	2018-02-08 15:24:04.893806637 -0800
@@ -5,7 +5,19 @@
 my $min_contig_size = 300;
 my $min_opera_contig_size = 500;
 my $long_read_cluster_threshold = 2;
-for($i = 0; $i < 6; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;}
+# sd and mean for fake end pairs derived from long reads.
+my @stds  = (100,  333,  667, 1333, 2000, 2667,  3333,  4667);
+my @means = (300, 1000, 2000, 4000, 6000, 8000, 10000, 14000);
+my %inter = ();
+my $num_it = @stds; #used elsewhere for the number of loop iterations.
+for($i = 0; $i < $num_it; $i++){
+   $inter{"i${i}"} = [$stds[$i],$means[$i]];
+}
+# For some reason original script had i0 with -200 for stds, so change that one
+$inter{"i0"} = [-200,$means[0]];
+
+
+for($i = 0; $i < $num_it; $i++){$cluster_threshold_tab[$i] = $long_read_cluster_threshold;}
 my $short_read_cluster_threshold = 3;
 #my @cluster_threshold_tab = (2,2,2,2,1,1);
 #my @cluster_threshold_tab = (2,2,2,2,2,1);
@@ -27,6 +39,9 @@
 #my $mapper_extention 
 my $graphmapDir = "/mnt/software/stow/graphmap-0.3.0-1d16f07/bin/";
 
+#for emitting more log information
+my $command;
+
 
 #Used in case of grapmap mapping
 my @contig_id_to_name = ();
@@ -39,7 +54,7 @@
 my $short_read_tooldir = "";
 my $samtools_dir = "";
 my $short_read_maptool = "bwa";
-my $kmer_size = 100;
+my $kmer_size = 61;
 my $flag_help;
 
 
@@ -59,6 +74,7 @@
 	--opera: Folder which contains opera binary (default PATH)
 	--illumina-read1: fasta file of illumina read1
 	--illumina-read2: fasta file of illumina read2
+        --upto-config: stop after config is generated, before OPERA-LG runs
         --help : prints this message
 ";
 
@@ -82,6 +98,7 @@
     "short-read-tooldir=s" => \$short_read_tooldir,
     "illumina-read1=s"      => \$illum_read1,
     "illumina-read2=s"      => \$illum_read2,
+    "upto-config+" => \$upto_config,
     "help"       => \$flag_help
 ) or die("Error in command line arguments.\n");
 
@@ -154,7 +171,7 @@
 chdir( $outputDir );
 
 my $str_full_path = "or please enter the full path";
-if ( ! -e $contigFile ) {die "\nError: $contigFile - contig file does not exist $str_full_path\n"};
+if ( ! -e $contigFile ) {die "\nError:  --num-of-processors $nproc $contigFile - contig file does not exist $str_full_path\n"};
 if ( ! -e $readsFile ) {die "\nError: $readsFile - long read file does not exist $str_full_path\n"};
 if ( ! -e $illum_read1 ) {die "\nError: $illum_read1 - illumina read 1 file does not exist $str_full_path\n"};
 if ( ! -e $illum_read2 ) {die "\nError: $illum_read2 - illumina read 2 file does not exist $str_full_path\n"};
@@ -172,7 +189,7 @@
     $str_path_dir = "";
     $str_path_dir .= "--tool-dir  $short_read_tooldir" if($short_read_tooldir ne "");
     $str_path_dir .= "--samtools-dir $samtools_dir" if($samtools_dir ne "");
-    run_exe("perl $operaDir/preprocess_reads.pl --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam");
+    run_exe("perl $operaDir/preprocess_reads.pl --num-of-processors $nproc --map-tool $short_read_maptool $str_path_dir --contig $contigFile --illumina-read1 $illum_read1 --illumina-read2 $illum_read2 --out ${file_pref}.bam");
 }
 
 if(! -e "$file_pref.map.sort"){
@@ -272,7 +289,7 @@
 }
 
 #Filter the long read edges
-for (my $i = 0; $i <= 5; $i++){
+for (my $i = 0; $i < $num_it; $i++){
     $edge_file = $all_edge_file."_i$i";
     $updated_edge_file = $all_edge_file."_no_repeat_i$i";
     push(@allEdgeFiles, $updated_edge_file);
@@ -296,22 +313,21 @@
 &CreateConfigFile( $contigFile, "", @allEdgeFiles );
 
 # run opera
-&run_exe( "${operaDir}OPERA-LG config > log" );
+if (defined($upto_config)) {
+   $time = localtime;
+   print STDERR "[$time] config file created, exiting without running OPERA-LG\n";
+   exit 0;
+}
+&run_exe( "$operaDir/OPERA-LG config > log" );
 
 #Link to the result file
 &run_exe("ln -s results/scaffoldSeq.fasta scaffoldSeq.fasta");
 
 sub extract_edge{
     my ($all_edge_file) = @_;
+    $time = localtime;
+    print STDERR "[$time] in sub extract_edge\n";
 
-    my %inter = (
-	"i0", [-200, 300],
-	"i1", [300, 1000],
-	"i2", [1000, 2000],
-	"i3", [2000, 5000],
-	"i4", [5000, 15000],
-	"i5", [15000, 40000]
-	);
     
     my %out_edge = ();
     foreach $it (keys %inter){
@@ -327,7 +343,9 @@
 	@line = split(/\t/, $_);
 	$dist = $line[4];
   	$support = $line[6];
-  	foreach $it (keys %inter){
+#  	foreach $it (keys %inter){
+  	for($i = 0; $i < $num_it; $i++){
+            $it = "i${i}";
 	    if($support >= 0 && $inter{$it}->[0] < $dist && $dist < $inter{$it}->[1]){
 		$OUT = $out_edge{$it};
 		print $OUT join("\t", @line)."\n";
@@ -499,6 +517,8 @@
 }
 
 sub checkMapping{
+    $time = localtime;
+    print STDERR "[$time] in sub createOutputFolder\n";
     my ($mapFile, $all_edge_file) = @_;
     my ($rn, $cn, $score, $unused, $ro, $rs, $re, $rl, $co, $cs, $ce, $cl);
     my $currentScore = 0;my $previousScore = 0;
@@ -636,6 +656,8 @@
 
 sub CreateConfigFile{
     my( $contigFile, $suffix, @edgeFiles) = @_;
+    $time = localtime;
+    print STDERR "[$time] in sub CreateConfigFile\n";
     
     open( CONF, ">config".$suffix ) or die $!;
 
@@ -666,8 +688,6 @@
     }
     
     $i = 0;
-    @means = (300, 1000, 2000, 5000, 15000, 40000);
-    @stds = (30, 100, 200, 500, 1500, 4000);
     foreach $edgeFileName ( @edgeFiles ){
 	    print CONF "[LIB]\n";
 	    #print CONF "cluster_threshold=$cluster_threshold\n";
@@ -687,6 +707,6 @@
 sub run_exe{
     my ($exe) = @_;
     $run = 1;
-    print STDERR $exe."\n";;
+    print STDERR "running command: $exe\n";
     print STDERR `$exe` if($run);
 }
--- preprocess_reads.pl.dist	2018-01-25 14:09:22.035548937 -0800
+++ preprocess_reads.pl	2018-01-26 16:58:02.056919925 -0800
@@ -15,12 +15,14 @@
 	--out			Name of the output file
 	--map-tool		Mapping tool can be either bwa (default) or bowtie
 	--tool-dir		Directory that contains binaries to the chosen mapping tool (default PATH)
-        --samtools-dir		Directory that contains samtools binaries (default PATH)
+	--samtools-dir		Directory that contains samtools binaries (default PATH)
+	--num-of-processors     number of processors used for mapping stages
 	--help			Help
 ";
 
 my $path = "";
 my $samtoolsDir = "";
+my $nproc=1;
 
 GetOptions(
 
@@ -31,12 +33,12 @@
     "map-tool=s"      => \$mapTool,
     "tool-dir=s"      => \$path,
     "samtools-dir=s"  => \$samtoolsDir,
+    "num-of-processors=i" => \$nproc,
     "help"	=> \$flag_help
 
 ) or die("Error in command line arguments.\n");
 
 
-
 if ( $flag_help ){
         print $help_message;
         exit 0;
@@ -133,6 +135,8 @@
 
 sub createOutputFolder
 {
+    $time = localtime;
+    print STDERR "[$time] in sub createOutputFolder\n";
     @dir = split( "/", $outputFile );
     $folder = "";
     for( $i = 0; $i < @dir - 1; $i++ )
@@ -148,6 +152,8 @@
 
 sub checkReadFormat
 {
+    $time = localtime;
+    print STDERR "[$time] in sub checkReadFormat\n";
     if(  $readFile1 =~ /\.gz$/ ){
 	# open gzip file
 	open( READ, "gunzip -c $readFile1 |" ) or die $!;
@@ -182,6 +188,7 @@
 {
     # build the index 
     $time = localtime;
+    print STDERR "[$time] in sub buildIndexUsingBowtie\n";
     print "[$time]\t";
     print "Building bowtie index...\n";
     @contigName = split( "/", $contigFile );
@@ -190,11 +197,13 @@
 	if( $type eq "cs" )
 	{
 	    $command = "${path}bowtie-build -C $contigFile $folder$contigName[ -1 ]";
+	    print "constructed command: $command\nExecuting command\n";
 	    `$command` or die "ERROR! Running bowtie-build error.\n";
 	}
 	else
 	{
 	    $command = "${path}bowtie-build $contigFile $folder$contigName[ -1 ]";
+	    print "constructed command: $command\nExecuting command\n";
 	    `$command` or die "ERROR! Running bowtie-build error.\n";
 	}
     }
@@ -207,6 +216,7 @@
 sub mapWithBowtie
 {
     $time = localtime;
+    print STDERR "[$time] in sub mapWithBowtie\n";
     print "[$time]\t";
     print "Mapping reads using bowtie...\n";
     if( $type eq "cs" )
@@ -217,11 +227,13 @@
     {
 	$command = "${path}bowtie -v 3 -a -m 1 -S -t $fasta -p 15 $folder$contigName[ -1 ] - 2>${folder}bowtie.err | sort -n > $folder$outputFile";
     }
+    print "constructed command: $command\n";
 }
 
 sub buildIndexUsingBwa
 {
     $time = localtime;
+    print STDERR "[$time] in sub buildIndexUsingBwa\n";
     print "[$time]\t";
     print "Building bwa index...\n";
     @contigName = split( "/", $contigFile );
@@ -230,12 +242,13 @@
 	if( $type eq "cs" )
 	{
 	    $command = "${path}bwa index -c -p $folder$contigName[ -1 ] $contigFile";
+	    print "constructed command: $command\nExecuting command\n";
 	    `$command`; # or die "ERROR! Running bwa index error.\n";
 	}
 	else
 	{
 	    $command = "${path}bwa index -p $folder$contigName[ -1 ] $contigFile";
-	    print $command."\n";
+	    print "constructed command: $command\nExecuting command\n";
 	    `$command`;  # or die "ERROR! Running bwa index error.\n";
 	}
     }
@@ -247,7 +260,9 @@
 
 sub readAndMap
 {
-    if(  $readFile1 =~ /\.gz$/ ){
+    $time = localtime;
+    print STDERR "[$time] in sub readAndMap\n";
+    if(  $readFile1 =~ /\.groot      62551   5389  0 14:06 ?        00:0z$/ ){
 	# open gzip file
 	open( F, "gunzip -c $readFile1 |" ) or die $!;
 	open( S, "gunzip -c $readFile2 |" ) or die $!;
@@ -257,6 +272,7 @@
 	open( S, "$readFile2" ) or die $!;
     }
     
+    print "Executing command with piped input\n";
     open( OUTPUT, "|-", $command ) or die $!;
     $num = 1;
     while( $first = <F> )
@@ -316,20 +332,24 @@
 {
     # align with bwa
     $time = localtime;
+    print STDERR "[$time] in sub findSAWithBwa\n";
     print "[$time]\t";
     print "Finding the SA coordinates of the reads using BWA aln...\n";
-    $command = "${path}bwa aln -t 30 $folder$contigName[ -1 ] - > ${folder}read.sai";
+    $command = "${path}bwa aln -t $nproc $folder$contigName[ -1 ] - > ${folder}read.sai";
+    print "constructed command: $command\n";
 }
 
 sub generateAlignmentUsingBwa
 {
     $time = localtime;
+    print STDERR "[$time] in sub generateAlignmentUsingBwa\n";
     print "[$time]\t";
     print "Generate alignments of reads using bwa sampe...\n";
     # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | awk \'{ if( \$3 != \"*\" ) print \$0 }\' > $folder$outputFile";
     # $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | samtools view -S -h -b -F 0x4 - | samtools sort -no - ${folder}temporarySam | samtools view -h -b - > $folder$outputFile";
-    $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile";
-    print $command."\n";
+    ## $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - ${folder}temporarySam > $folder$outputFile";
+    $command = "${path}bwa samse -n 1 $folder$contigName[ -1 ] ${folder}read.sai - | grep '\\(^@\\|XT:A:U\\)' | ${samtoolsDir}samtools view -S -h -b -F 0x4 - | ${samtoolsDir}samtools sort -\@ 20 -no - > $folder$outputFile";
+    print "constructed command: $command\n";
 }
 
 sub clearBowtie
EOD
cd $OTOPDIR/src/OPERA-LG_v2.0.6/bin
patch -p0 </tmp/olg_patches.txt


##get and install FGAP as part of gapFinisher.
cd $OTOPDIR/src
wget http://www.helsinki.fi/~jkammone/gapFinisher.zip
unzip gapFinisher.zip
rm gapFinisher.zip

##get and install matlab run time library for Linux, needed by FGAP
mkdir -p /tmp/mcr
cd /tmp/mcr
wget http://ssd.mathworks.com/supportfiles/MCR_Runtime/R2012a/MCR_R2012a_glnxa64_installer.zip
unzip MCR_R2012a_glnxa64_installer.zip
rm MCR_R2012a_glnxa64_installer.zip
./install
# graphical installer comes up, put it in 
# $OTOPDIR/src/MATLAB

##get and install bbmap 37.23
# from https://sourceforge.net/projects/bbmap
# install in $OTOPDIR/src
##get and install other dependencies like 
#  blasr
#  bwa-mem
#  binary_to_fasta,  fasta_to_binary, other fasta utilities
#     (http://saf.bio.caltech.edu/pub/software/molbio/),
#  extract (from drm_tools on sourceforge)
#  perl script dependencies
#  etc.
#

#on oboe 01/09/2018
cd $OTOPDIR/Lv/Lv_ILLUMINA
#download 3 SRR's from NCBI
#SRR176809,SRR176810,SRR176812, 2.5kb insert Illumina paired end.
#break out data with:
fastq-dump  --split-3 SRR176809.sra
fastq-dump  --split-3 SRR176810.sra
fastq-dump  --split-3 SRR176812.sra

#on oboe 01/09/2018
#trim raw Illumina 400bp insert, Paired end.
#trim Illumina mate pair
#make the names simpler.
cd $OTOPDIR/Lv/Lv_ILLUMINA
platanus_pe_trim pe_400_R1.fastq pe_400_R2.fastq \
  2>ppe_trim.log 1>&2 &
platanus_mp_trim SRR176809_1.fastq  SRR176809_2.fastq >pmp_09.log 2>&1 &
platanus_mp_trim SRR176810_1.fastq  SRR176810_2.fastq >pmp_10.log 2>&1 &
platanus_mp_trim SRR176812_1.fastq  SRR176812_2.fastq >pmp_12.log 2>&1 &
ln -s SRR176812_2.fastq.int_trimmed SRR176812_2.trimmed.fq
ln -s SRR176812_1.fastq.int_trimmed SRR176812_1.trimmed.fq
ln -s SRR176810_2.fastq.int_trimmed SRR176810_2.trimmed.fq
ln -s SRR176810_1.fastq.int_trimmed SRR176810_1.trimmed.fq
ln -s SRR176809_2.fastq.int_trimmed SRR176809_2.trimmed.fq
ln -s SRR176809_1.fastq.int_trimmed SRR176809_1.trimmed.fq
ln -s pe_400_R2.fastq.trimmed       pe_400_R2.trimmed.fq  
ln -s pe_400_R1.fastq.trimmed       pe_400_R1.trimmed.fq  

#platanus assemble, on oboe 01/11/2018
mkdir -p $OTOPDIR/Lv/platanus/run1
cd $OTOPDIR/Lv/platanus/run1
platanus assemble -o LvPtA -f \
  ../Lv_ILLUMINA/pe_400_R1.fastq.trimmed \
  ../Lv_ILLUMINA/pe_400_R2.fastq.trimmed \
  -t 48 -m 256 -u 1.0 

#platanus scaffold, on oboe 02/07/2018
# add mate pairs and 454 pairs
mkdir -p $OTOPDIR/Lv/platanus/run5
cd $OTOPDIR/Lv/platanus/run5
platanus scaffold -o LvPtE \
  -c ../run1/LvPtA_contig.fa \
  -b ../run1/LvPtA_contigBubble.fa \
  -IP1 ../../Lv_ILLUMINA/pe_400_R1.trimmed.fq \ ../../Lv_ILLUMINA/pe_400_R2.trimmed.fq -n1 100 \
  -IP2 ../../Lv_454/pairs.1.fastq ../../Lv_454/pairs.2.fastq \
  -a2 2268 -d2 1121 -n2 150 \
  -OP3 ../../Lv_ILLUMINA/SRR176809_1.trimmed.fq \
  ../../Lv_ILLUMINA/SRR176809_2.trimmed.fq -n3 1200 \
  -OP4  ../../Lv_ILLUMINA/SRR176810_1.trimmed.fq \
  ../../Lv_ILLUMINA/SRR176810_2.trimmed.fq -n4 1200 \
  -OP5  ../../Lv_ILLUMINA/SRR176812_1.trimmed.fq \
  ../../Lv_ILLUMINA/SRR176812_2.trimmed.fq -n5 1200 -t 48 -u 1.0 
#makes LvPtE_scaffold.fa

#platanus gap_close, on oboe 02/08/2018
nice platanus gap_close -o LvPtE -c LvPtDE_scaffold.fa  \
  -f    ../../Lv_454/single.fastq                  ../../Lv_454/pairs.u.fastq               \
  -IP1  ../../Lv_ILLUMINA/pe_400_R1.trimmed.fq     ../../Lv_ILLUMINA/pe_400_R2.trimmed.fq   \
  -IP2  ../../Lv_454/pairs.1.fastq                 ../../Lv_454/pairs.2.fastq               \
  -OP3  ../../Lv_ILLUMINA/SRR176809_1.trimmed.fq   ../../Lv_ILLUMINA/SRR176809_2.trimmed.fq \
  -OP4  ../../Lv_ILLUMINA/SRR176810_1.trimmed.fq   ../../Lv_ILLUMINA/SRR176810_2.trimmed.fq \
  -OP5  ../../Lv_ILLUMINA/SRR176812_1.trimmed.fq   ../../Lv_ILLUMINA/SRR176812_2.trimmed.fq \
  -t 48
#makes LvPtE_gapClosed.fa
  
#on bassoon 02/15/2018
#make a script fgap_wrapper.sh which runs fgap in parallel and only
#  on sequences which contain N's (so have a gap to close).  It also
#  retrieves megareads efficiently using the fasta_to_binary and
#  binary_to_fasta programs.  This is a slightly updated
#  version of the program which was used (different error checking
#  results should be the same.)

cat >$OTOPDIR/bin/fgap_wrapper.sh <<'EOD'
#!/bin/bash
# 2018/02/26  David Mathog, mathog@caltech.edu.
#
# This is a wrapper for fgap which runs only the subset of 
# scaffolds which:
# 1.  contain N's
# 2.  had corrected PacBIO reads map to flanking regions for those N's.
# It is MUCH  faster than letting fgap run on a big genome, which for the 1Gb
# one I tried ran for 6 days, most of it with no output, and which was
# finally terminated without having emitted a result.
#
# Method:
#     Run this script.  Providing the parameters:
#          scaffold_file (fasta)
#          corrected long reads (fasta)
#          number of threads
#          longest sequence allowed (buffer size to allocate)
#
#  fgap is run with -z 1 -g 1. 
#  -z 1 lets it remove just the N's without entering anything else.
#  -g 1 lets it trim off a bit on the ends flanking the Ns.  For some
#     reason that is needed on sequences like this:
#     .......AAAAAAAAAAANNNNNNNAGAAAAA....
#     Most likely because the blastn extensions span the short NNN and so the
#     mapped corrected pacbio positions APPEAR to overlap, like:
#     100 230  and 200 300  but this is an alignment issue resulting from
#     doing each side separately, not a real overlap.
#

### edit this section if paths change
export LONGP=$OTOPDIR/src/MATLAB/MATLAB_Compiler_Runtime/v717
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\
$LONGP/runtime/glnxa64:\
$LONGP/bin/glnxa64:\
$LONGP/sys/os/glnxa64:\
$LONGP/sys/java/jre/glnxa64/jre/lib/amd64/native_threads:\
$LONGP/sys/java/jre/glnxa64/jre/lib/amd64/server:\
$LONGP/sys/java/jre/glnxa64/jre/lib/amd64   

PFGAP=$OTOPDIR/src/gapFinisher/FGAP/fgap
ABATCH=16         # map lines per batch in each thread
MINSCORE=50       # also minimum overlap
###

test_exec() {
   THEFILE=`which $1`
   if [ ! -x $THEFILE ]
   then
      echo "fgap_wrapper.sh: fatal error: required binary $THEFILE is not found or executable"
      exit
   fi
}

test_read() {
   THEFILE=$1
   if [ ! -r $THEFILE ]
   then
      echo "fgap_wrapper.sh: fatal error: required input $THEFILE is not found or readable"
      exit
   fi
}

do_one_set(){
   CHUNK=$1
   START=$2
   LAST=$3
   MAPF=../map_scf_read.txt
   SCFF=../scf_db
   CPBF=../cpb_db
   mkdir chunk_$CHUNK
   cd chunk_$CHUNK
   NOW=`date`
   echo "$NOW thread $CHUNK range $START $LAST" >>all_out.log
   while [ $START -le $LAST ]
   do
       FINAL=$(( $START + $ABATCH - 1 ))
       if [ $FINAL -ge $LAST ]
       then
          FINAL=$LAST
       fi
       NOW=`date`
       echo "$NOW thread $CHUNK processing batch $START $FINAL" >>all_out.log
       extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[1]' -wl $MAXMAPLEN \
         | binary_to_fasta -in $SCFF -sf 1 -of 2 >scf.fasta 2>/dev/null
       extract -in $MAPF -sr $START -er $FINAL -mt -dl ' ' -fmt '[2,]' -dv '\n' -wl $MAXMAPLEN \
         | sort -n | uniq \
         | binary_to_fasta -in $CPBF -sf 1 -of 2 >reads.fasta 2>/dev/null
       #
       # do NOT use -z 1 with -g 1, some sequences will take HOURS to process!!!!
       #
       $PFGAP -d scf.fasta -a reads.fasta -R 2500 -I 3500 -t 1 -s 50 -z 1 >>all_out.log 2>&1 </dev/null
       echo "FGAP exit status $? " >>all_out.log
       if [ -e output_fgap.stats ]
       then
          cat output_fgap.stats >> all_out.stats
          rm output_fgap.stats
       else
          echo "FAILURE: no output_fgap.stats was created" >> all_out.stats
       fi
       if [ -e output_fgap.final.fasta ]
       then
          mv output_fgap.final.fasta scf.fasta
          #
          #  negative ones that can really be replaced are generally small so set -R low.
          #  Otherwise -g 1 may sometimes remove and replace large swaths of bases
          #  which are not Ns.
          #
          $PFGAP -d scf.fasta -a reads.fasta -R 200 -I 500 -t 1 -s 50 -p 0 -g 1 >>all_out.log 2>&1 </dev/null
          echo "FGAP exit status $? " >>all_out.log
          if [ -e output_fgap.stats ]
          then
             cat output_fgap.stats >> all_out.stats
             rm output_fgap.stats
          else
             echo "FAILURE: no output_fgap.stats was created" >> all_out.stats
          fi
          if [ -e output_fgap.final.fasta ]
          then
             cat output_fgap.final.fasta >>all_out.fasta
             rm -f output_fgap.final.fasta
          else
             # do not log failures here because negative gaps are pretty rare so many will "fail"
             cat scf.fasta >>all_out.fasta  # results from first round are still good, copy them
          fi
          rm scf.fasta
       else
          echo "$NOW BATCH FAILURE -z 1: Some issue with this set or no gaps could be filled." >> all_out.log
       fi
       #
       #  If no gaps are filled output_fgap.final.fasta is not written.
       #  If it is not removed afterwards that scaffold will be entered twice into the final list.
       #  Logic above handles these cases.
       #
       START=$(( $FINAL + 1 ))
   done
   rm scf.fasta
   NOW=`date`
   echo "$NOW thread $CHUNK range completed" >>all_out.log
}

NOW=`date`
echo "$NOW processing begins"

SCAFFOLDS=$1
CORRECTED=$2
THREADS=$3
MAXSEQ=$4

test_exec binary_to_fasta
test_exec extract
test_exec seq
test_exec fasta2fgap
test_exec fastafrag
test_exec blastn
test_exec makeblastdb
test_exec many_blast_plus_1cpu.sh
test_read $SCAFFOLDS
test_read $CORRECTED


#the matlab runtime will sink a hook into X11 if it can find it.  If the starting
#session then exits so will the runtime.  So hide X11 from the matlab runtime.
if [ $DISPLAY ]
then
   unset DISPLAY
fi

WORKDIR=tmp_fgap

mkdir -p $WORKDIR

NOW=`date`
if [ -e $WORKDIR/datasets.fasta ]
then
echo "$NOW using EXISTING converted corrected reads datasets.fasta"
else
#
#  corrected reads. Differs very slightly from FGAP in that it 
#  retains the original header after a space.  blastn will drop that
#  extra text, so final blastn.out will be the same.
#
echo "$NOW converting corrected reads to datasets.fasta"
fastafrag -length 1000000 -outlen 60 -prefix 'dataset_1_%d ' -upper \
   <$CORRECTED >$WORKDIR/datasets.fasta
fi

NOW=`date`
if [ -e $WORKDIR/draft_regions.fasta ]
then
echo "$NOW using EXISTING scaffolds to draft_regions.fasta"
else
echo "$NOW converting scaffolds to draft_regions.fasta"
fasta2fgap -in $SCAFFOLDS -f 300 -m $MINSCORE -d 1 > $WORKDIR/draft_regions.fasta
fi


cd $WORKDIR

NOW=`date`
if [ -e datasets.00.nsq ]
then
echo "$NOW using EXISTING blast database made from datasets.fasta"
else
echo "$NOW making blast database out of datasets.fasta"
makeblastdb -dbtype nucl -in datasets.fasta -title datasets -out datasets
fi

NOW=`date`
if [ -e blastn.out ]
then
echo "$NOW using EXISTING blastn.out"
else
echo "$NOW running blastn for draft_regions vs. datasets to make blastn.out"
BLASTPARAMS="-db datasets -task blastn -min_raw_gapped_score $MINSCORE -evalue 1e-07 -perc_identity 70 -word_size 15 -gapopen 1 -gapextend 1 -penalty -3 -reward 1 -max_target_seqs 200 -outfmt \"6 qseqid sseqid score bitscore evalue pident nident mismatch sstrand qstart qseq qend sstart sseq send qlen length qcovhsp\""

# PROG=$1
# INFILE=$2
# OUTFILE=$3
# NCPUS=$4
# BLASTPARAMS=$5
many_blast_plus_1cpu.sh blastn draft_regions.fasta blastn.out $THREADS "$BLASTPARAMS"
fi

cd ..

#back in top level

#
#  next command makes a file with structure:
#     scaffold read1 .... readN
#  For an assembly with very large scaffolds and lots of repeats
#  the default of 50M might not be enough.  That should be enough to
#  hold over 6M reads if they take <=8 characters each.
#
NOW=`date`
if [ -e map_scf_read.txt ]
then
echo "$NOW using EXISTING making map_scf_read.txt"
else
echo "$NOW making map_scf_read.txt from blastn.out"
extract -in $WORKDIR/blastn.out -mt -dl ' \t_' -fmt '[fw6:jr:2] [7]' \
   | sort -k 1,1n -k2,2n \
   | uniq \
   | extract -merge 6 -wl 50000000 >map_scf_read.txt
test_read map_scf_read.txt
fi

NOW=`date`
if [ -e cpb_db.hea ]
then
echo "$NOW using EXISTING cpb_db fasta_to_binary db from $CORRECTED"
else
echo "$NOW making fasta_to_binary db from $CORRECTED"
fasta_to_binary -in $CORRECTED -out cpb_db -ls -ms 500000
test_read cpb_db.hea        #one file from binary database of corrected PacBIO reads
fi

NOW=`date`
if [ -e scf_db.hea ]
then
echo "$NOW using EXISTING scf_db fasta_to_binary db from $SCAFFOLDS"
else
echo "$NOW making fasta_to_binary db from $SCAFFOLDS"
#
# Replace ">header" with  ">123 header".
# This VASTLY simplifies some of the later steps because there is always
# an easy to parse index right at the start of every header and we don't
# need to poke around within the header itself.  Also in some cases there
# may not even be an index in the header which corresponds the the entry's
# position in the fata file.
#
cat $SCAFFOLDS  \
  | fastafrag -prefix  '%d ' \
  | fasta_to_binary -in - -out scf_db -ls -ms 500000
test_read scf_db.hea        #one file from binary database of scaffolds with gaps
fi

MAXMAPLEN=`cat map_scf_read.txt | wc -L `
MAXMAPLEN=$(( $MAXMAPLEN + 100)) #a few extra bytes are needed
ORIGSCFN=`binary_to_fasta -in scf_db -sf 0 -of 2 2>/dev/null | grep '>' | wc -l` 
LINES=`cat map_scf_read.txt | wc -l`      #lines in the map
BLOCK=$(( 1 + $LINES / $THREADS ))  #total lines from map processed per thread

NOW=`date`
echo "$NOW lines $LINES block $BLOCK threads $THREADS origscfN $ORIGSCFN"

NOW=`date`
if [ -e pre_mod_scaffolds.fasta ]
then
echo "$NOW using EXISTING pre_mod_scaffolds.fasta mod.log mod.stats" 
else
echo "$NOW generating pre_mod_scaffolds.fasta mod.log mod.stats" 
I=1
MAXTHREADS=0
TSTART=1
while [ $I -le $THREADS ]
do
   TEND=$(($TSTART + $BLOCK -1))
   if [ $TSTART -gt $LINES ]
   then
      break
   else
      MAXTHREADS=$I
   fi
   if [ $TEND -gt $LINES ]
   then
       TEND=$LINES
   fi
   do_one_set $I $TSTART $TEND &
   TSTART=$(( $TEND + 1 ))
   I=$(( $I + 1))
done
NOW=`date`
echo "$NOW started threads: $MAXTHREADS" 
wait

#assemble the pieces which it made

truncate --size 0 pre_mod_scaffolds.fasta
truncate --size 0 mod.log
truncate --size 0 mod.stats
I=1
while [ $I -le $MAXTHREADS ]
do
   NOW=`date`
   echo "$NOW merging results from thread $I"
   SUBDIR=chunk_$I
   cat $SUBDIR/all_out.fasta >> pre_mod_scaffolds.fasta
   cat $SUBDIR/all_out.log   >> mod.log
   cat $SUBDIR/all_out.stats >> mod.stats
   rm -rf $SUBDIR
   I=$(( $I + 1))
done
fi

NOW=`date`
if [ -e scaffold_indices.txt ]
then
echo "$NOW using EXISTING scaffold_indices.txt" 
else
seq 1 $ORIGSCFN > scaffold_indices.txt
fi

#
# some of the scaffolds put into fgap will NOT have been patched at all.
# If they were not they will not be in any of the all_out.fasta files
# and so not in pre_mod_scaffolds.fasta.  The next section grabs the 
# scaffold number from the header of that file, and it depends on
# all scaffolds having the same format headers!
#
NOW=`date`
if [ -e list_modified.txt ]
then
echo "$NOW using EXISTING list_modified.txt" 
else
echo "$NOW creating list_modified.txt" 
extract -in pre_mod_scaffolds.fasta -if '^>' -ifonly -mt -dl '> ' -fmt '[fw10:jr:1]X' -wl $MAXMAPLEN >list_modified.txt
extract -in scaffold_indices.txt -fmt '[fw10:jr:1,]' >>list_modified.txt
fi

NUMMODIFIED=`cat list_modified.txt | grep 'X' | wc -l`

#
#  for speed extract all of the entries in one fell swoop which fgap didn't process.  Collapse
#  the fasta files into single lines and sort on the first field. Then process back into fasta
#  form.
#
NOW=`date`
if [ -e modified_scaffolds.txt ] && [ -e unmodified_scaffolds.txt ]
then
echo "$NOW using EXISTING modified_scaffolds.txt" 
else
echo "$NOW creating modified_scaffolds.txt" 
sort -k 1,1n list_modified.txt | extract -merge 10 >runs_both
grep -v X runs_both >unmodified_scaffolds.txt
grep X runs_both | tr -d 'X' > modified_scaffolds.txt
rm runs_both
fi

NOW=`date`
if [ -e tmp_db.hea ]
then
echo "$NOW using EXISTING tmp_db" 
else
echo "$NOW making tmp_db" 
#
# Enter two sets of sequences first modified, then unmodified.
#
( cat pre_mod_scaffolds.fasta ; \
  binary_to_fasta -in scf_db -sel unmodified_scaffolds.txt -sf 1 -of 2 ) \
  | fasta_to_binary -out tmp_db -ls -ms 500000
fi

#
#  make the final sequence files.  Remove the numeric index
#  from the headers.
#
cat modified_scaffolds.txt unmodified_scaffolds.txt \
  | extract -in -,scaffold_indices.txt -indl ' '  \
  | sort -k 1,1n \
  | extract -mt -fmt '[2]' \
  | binary_to_fasta -in tmp_db -out - -sf 1 -of 2 \
  | extract -out all_scf.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ

cat  pre_mod_scaffolds.fasta \
  | extract -out mod_scaffolds.fasta -if '>' -mt -dl '> ' -fmt '>[2,]' -wl $MAXSEQ


NOW=`date`
echo "$NOW cleaning up temporary files" 

# save these when debugging: change true to false
if true
then
rm cpb_db.*
rm scf_db.*
rm tmp_db.*
rm list_modified.txt
fi

NOW=`date`
echo "$NOW Scaffolds which were processed: $MODSEL"
echo "$NOW Scaffolds which were modified:  $NUMMODIFIED"
echo "$NOW List of processed scaffolds are in modified_scaffolds.txt"
echo "$NOW Merged scaffold sequences, modified and unmodified, are in all_scf.fasta"
echo "$NOW done"

EOD

#gap fill with FGAP, on bassoon 02/15/2018
mkdir -p $BTOPDIR/Lv/FGAP/LvPtE

#move some data oboe to bassoon, on oboe 02/15/2018
cd $OTOPDIR/Lv/do_masurca_B
scp mr.41.15.17.0.029.1.fa \
  bassoon:$BTOPDIR/Lv/FGAP/lv_mscB_mr.fa
cd $OTOPDIR/Lv/platanus/run5
scp LvPtE_gapClosed.fa bassoon:/tmp/LvPtE_gapClosed.fa

#resume work, gap fill producing all_scf.fasta, on bassoon 02/15/2018
cd $BTOPDIR/Lv/FGAP/LvPtE
ln -s /tmp/LvPtE_gapClosed.fa LvPtE_gapClosed.fa
nice fgap_wrapper.sh LvPtE_gapClosed.fa ../lv_mscB_mr.fa 44 50000000 \
  >fgap_wrapperB.log 2>&1

#deduplicate the gap filled sequences, on bassoon 03/02/2018
# tried many different combinations of parameters, this was best.
# Because of the highly polymorphic sequences dedupe.sh does not work 
# very well.  Even if the geometry is right the gaps/mismatches often
# exceed the limits and the two haplotypes of a sequence will both persist.
# Also it fails miserably when the two haplotype pieces overlap like this:
#    ------------
#         ------------
#
mkdir $BTOPDIR/Lv/FGAP/other
cd $BTOPDIR/Lv/FGAP/other
nice $OTOPDIR/src/bbmap/dedupe.sh \
  in=../all_scf.fasta \
  out=LvPtE_gapClosedFGAPdedupe7.fa \
  outd=LvPtE_gapClosedFGAP_dups_removed7.fa \
  findoverlap=t minidentity=97 maxedits=12 \
>LvPtE_gapClosedFGAP_dedupe7.log 2>&1
  
#make bam files for OPERA-LG, on bassoon 03/02/2018
cat >do_others2.sh <<'EOD'
#!/bin/bash
#edit the next two lines
CONTIGS=LvPtE_gapClosedFGAPdedupe7.fa
PREFIX=olg_e5_

do_one() {
THIS=$1
nice perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/preprocess_reads.pl \
  --num-of-processors 44 --map-tool bwa  \
  --contig $CONTIGS \
  --illumina-read1 $BTOPDIR/Lv/Lv_ILLUMINA/${THIS}_1.trimmed.fq \
  --illumina-read2 $BTOPDIR/Lv/Lv_ILLUMINA/${THIS}_2.trimmed.fq \
  --out ${PREFIX}${THIS}.bam >${PREFIX}${THIS}_bam.log 2>&1
}


cd $BTOPDIR/Lv/FGAP/other
NOW=`date`
echo "$NOW starting 454 mapping"
nice perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/preprocess_reads.pl \
  --num-of-processors 44 --map-tool bwa  \
  --contig $CONTIGS \
  --illumina-read1 $BTOPDIR/Lv/Lv_454/pairs.1.fastq \
  --illumina-read2 $BTOPDIR/Lv/Lv_454/pairs.2.fastq \
  --out ${PREFIX}454.bam >${PREFIX}454_bam.log 2>&1
  
do_one  SRR176809
do_one  SRR176810
do_one  SRR176812
NOW=`date`
echo "$NOW done"
EOD
chmod 755 do_others2.sh
./do_others2.sh >do_others2.log 2>&1

#run OPERA, output to results1 subdirectory, on oboe 03/02/2018
#this uses bam files made above plus it makes olg_e5.bam using
#the 400bp pe set.
#pacbio_nonredundant.fa was made by MaSuRCA, it follows the CCS step
#but is before the megareads are constructed.
cd $OTOPDIR/Lv/OPERA-LG/E_RUN5
scp  bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAPdedupe7.fa .
scp  bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAP_dups_removed7.fa .
scp bassoon:$BTOPDIR/Lv/FGAP/other/LvPtE_gapClosedFGAP_dedupe7.log .
scp bassoon:$BTOPDIR/Lv/FGAP/LvPtE_gapClosedFGAP.fa .
cd ..
perl $OTOPDIR/src/OPERA-LG_v2.0.6/bin/OPERA-pacbio-read.pl \
    --upto-config \
    --num-of-processors 48 \
    --kmer 31 \
    --opera $OTOPDIR/src/OPERA-LG_v2.0.6/bin \
    --contig-file    $OTOPDIR/Lv/OPERA-LG/E_RUN5/LvPtE_gapClosedFGAPdedupe7.fa \
    --illumina-read1 $OTOPDIR/Lv/Lv_ILLUMINA/pe_400_R1.trimmed.fq  \
    --illumina-read2 $OTOPDIR/Lv/Lv_ILLUMINA/pe_400_R2.trimmed.fq  \
    --long-read-file $OTOPDIR/Lv/do_masurca_B/pacbio_nonredundant.fa \
    --output-prefix  olg_e5 \
    --output-directory E_RUN5 \
    >e_run5.log 2>&1 &
cd E_RUN5
#
#A config file was created by the preceding command, but it does not know
#about the 454 and Illumina MP data.  Make e5_config from it.  For clarity,
# here is the entire e5_config file:
cat >e5_config <<'EOD'
#
# Essential Parameters
#

# Output folder for final results
output_folder=results1
# Contig file
contig_file=$OTOPDIR/Lv/OPERA-LG/E_RUN5/LvPtE_gapClosedFGAPdedupe.fa
samtools_dir=
kmer=31
# Mapped read locations
[LIB]
map_file=olg_e5.bam
cluster_threshold=3
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i0
lib_mean=300
lib_std=100
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i1
lib_mean=1000
lib_std=333
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i2
lib_mean=2000
lib_std=667
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i3
lib_mean=4000
lib_std=1333
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i4
lib_mean=6000
lib_std=2000
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i5
lib_mean=8000
lib_std=2667
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i6
lib_mean=10000
lib_std=3333
map_type=opera
[LIB]
cluster_threshold=2
map_file=pairedEdges_no_repeat_i7
lib_mean=14000
lib_std=4667
map_type=opera
[LIB]
map_file=olg_e5_454.bam
cluster_threshold=3
lib_mean=2300
lib_std=1200
[LIB]
map_file=olg_e5_SRR176809.bam
cluster_threshold=3
lib_mean=2300
lib_std=1200
[LIB]
map_file=olg_e5_SRR176810.bam
cluster_threshold=3
lib_mean=2300
lib_std=1200
[LIB]
map_file=olg_e5_SRR176812.bam
cluster_threshold=3
lib_mean=2300
lib_std=1200
EOD
$OTOPDIR/src/OPERA-LG_v2.0.6/bin/OPERA-LG \
  e5_config >olg_e5.log 2>&1

#gap fill what remains, it makes all_scf.fasta, on oboe 03/03/2018
#Megareads from LvMSCB assembly are used.
mkdir -p $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP
cd $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP
ln -s ../scaffoldSeq.fasta LvPtE_gapClosedFGAPdedupeOLG.fa
ln -s $OTOPDIR/Lv/do_masurca_B/mr.41.15.17.0.029.1.fa lv_mscB_mr.fa
nice fgap_wrapper.sh LvPtE_gapClosedFGAPdedupeOLG.fa \
  lv_mscB_mr.fa 44 50000000 \
  >fgap_wrapper.log 2>&1 &

#on oboe 03/04/2018
# dedupe, again.
# makes LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa
cd $OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP
mv all_scf.fasta LvPtE_gapClosedFGAPdedupeOLGFGAP.fa
$OTOPDIR/src/bbmap/dedupe.sh \
  in=LvPtE_gapClosedFGAPdedupeOLGFGAP.fa \
  out=LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa \
  outd=LvPtE_gapClosedFGAPdedupeOLGFGAP_dups_removed.fa \
  findoverlap=t minidentity=97 maxedits=12 \
  >LvPtE_gapClosedFGAPdedupeOLGFGAP_dedupe.log 2>&1
ln -s LvPtE_gapClosedFGAPdedupeOLGFGAPdedupe.fa LvPtE5.fasta

#on bassoon 03/07/2018
#use P_RNA_scaffolder to merge some LvPtE5.fasta scaffolds
#makes results/P_RNA_scaffold.fasta
mkdir -p $BTOPDIR/Lv/P_RNA_scaffolder
cd $BTOPDIR/Lv/P_RNA_scaffolder
scp oboe:$OTOPDIR/Lv/OPERA-LG/E_RUN5/results1/FGAP/LvPtE5.fasta .
cp ../Lv_TRINITY/get_sra.sh .
#For now just get one of these:  SRR1661111
echo SRR1661111 >SRR.desc
nice ./get_sra.sh
fastq-dump --defline-seq '@$sn[_$rn]/$ri' --split-3 SRR1661111.sra
rm SRR1661111.sra
$OTOPDIR/src/hisat2-2.1.0/hisat2-build LvPtE5.fasta Lv_hisat
$OTOPDIR/src/hisat2-2.1.0/hisat2 -x Lv_hisat -1 SRR1661111_1.fastq \
  -2 SRR1661111_2.fastq \
  -k 3 -p 40 --pen-noncansplice 1000000 -S lv_input.sam \
  > hisat_pass2.log 2>&1
mkdir results
mv lv_input.sam Lv_input.sam
$OTOPDIR/src/P_RNA_scaffolder/P_RNA_scaffolder.sh \
  -d $OTOPDIR/src/P_RNA_scaffolder \
  -i Lv_input.sam \
  -j LvPtE5.fasta \
  -F SRR1661111_1.fastq -R SRR1661111_2.fastq \
  -o $BTOPDIR/Lv/P_RNA_scaffolder/results \
  -s yes -t 40 >p_rna.log 2>&1
mv results Lv_results

#on bassoon 03/07/2018
#another round of gap filling with FGAP, makes all_scf.fasta
mkdir $BTOPDIR/FGAP/LvPtE5B
cd $BTOPDIR/FGAP/LvPtE5B
ln -s $BTOPDIR/Lv/P_RNA_scaffolder/Lv_results/P_RNA_scaffold.fasta 
LvPtE5B.fa
fgap_wrapper.sh \
  LvPtE5B.fa  \
  ../lv_mscB_mr.fa 40 50000000 \
  >fgap_wrapper.log 2>&1 &

#mathog on bassoon 03/09/2018
#makes final LvPtE5C.fa
cd $BTOPDIR/Lv/FGAP/LvPtE5B
  $OTOPDIR/src/bbmap/dedupe.sh \
  in=all_scf.fasta \
  out=LvPtE5C.fa \
  outd=LvPtE5C_dups_removed.fa \
  findoverlap=t minidentity=97 maxedits=12 \
  >LvPtE5C_dedupe.log 2>&1 &