#!/usr/bin/perl

######################################################################
#                             FastQFS                                #
######################################################################
#
#    A Tool for evaluating and filtering paired-end sequencing data 
#    generated from high throughput sequencing
#
#    Copyright (C) 2014 -          Rahul Sharma
#                                  Marco Thines
#	
#    Under GNU GPL v3 License                                   
#
#    FastQFS is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    FastQFS is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with FastQFS. If not, see <http://www.gnu.org/licenses/>.
#
#
#######################################################################

# Author: Rahul Sharma and Marco Thines
# Version: V1.0
#

###############################
## Libraries Used
###############################

use warnings;
use strict;
use Getopt::Long;  
use Data::Dumper;


###############################
## Usgae message
###############################


my ($filtering, $plotting)=("No", "No");

my ($fw, $rw, $prefix, $mq, $sc, $gsize, $q, $l, $help); 

## Default values
$mq=3;
$sc=33;
$q=20;
$l=100;


my $usage="

###
### Generate plots
###

perl $0 -plotting Yes -fw <Forward reads file> -rw <Reverse reads file> -prefix <Prefix for the output files> -sc <Scoring system used (33 or 64)> -gsize <expected genome size in MBs> -l <Read length> 

###
### Perform read filtering
###

perl $0 -filtering Yes -fw <Forward reads file> -rw <Reverse reads file> -prefix <Prefix for the output files> -sc <Scoring system used (33 or 64)> -mq <Base_quality_cutoff> -q <Average read quality threshold> -l <Length threshold> -plotting <Yes/No> -gsize <expected genome size in MBs> 

OPTIONS:

## File related options
-fw 		: File having the forward reads.
-rw 		: File having the reverse reads.
-prefix 	: Prefix for the output files (forward file, reverse file, single file and other log files).

## Filtering parameters related options
-sc 		: Scores used for reads quality (33 - Sanger Phred+33, 64 - Solexa Solexa+64) [Default: $sc] 
-mq 		: Minimum quality threshold for a base to filter [Default: $mq]
-q  		: Average quality threshold filter [Default: $q]
-l  		: Length threshold filter [Default: $l]
-gsize 		: Expected genome size in MBs to calculate reads coverage depth. 

## General options 
-plotting 	: Plot length distribution table (Yes or No).
-filtering 	: Perform the data filtering or not (Yes or No).
-h  		: Help message.


# Author: Rahul Sharma and Marco Thines 
# Institute: Bik-F, Senckenberg.
# Date: 12th, June, 2014
# Version: V1.0
#
# Description:
#
# 	This script to helps user in making decisions on the data filtering parameters. It shows reads statistics 
# 	by calculating average quality, length and minimum quality of a base of both pairs. It considers average
# 	qualities of 18 to 30 (increment of 4) and minimum base quality from 3 to 18 (increment of 5) and lengths
# 	from L-50 (L is user-specified length) with the increment of 10. Later it can also filter out the reads
# 	using following criteria:
#
#	1). Reads average quality cutoff.
#	2). Whole read will be discarded if one of the base is having quality below
#	    the provided quality cutoff.
#	3). Reads which are shorter than the provided length cutoffs.
#	4). Reads which are having Ns.
";


###############################
## Options definition
###############################

GetOptions ("fw=s" => \$fw,         
            "rw=s" => \$rw,
            "prefix=s" => \$prefix,
            "sc=i" => \$sc,
            "mq=i" => \$mq,
            "q=i" => \$q,
            "l=i" => \$l,
            "gsize=f" => \$gsize,
            "plotting=s" => \$plotting,
            "filtering=s" => \$filtering,
            "h|help!"  => \$help) or die "Input options were not recognized!", $usage;          
 
  if ($help) {
         die $usage;
  }


my ($st, $cov_ln,$read_count);


if(($filtering =~ /Yes/i) && ($plotting =~ /No/i)) 
	 {
   		die "\nInput parameters required",$usage unless  ($fw && $rw && $prefix);
           print "STEP.1: Performing data filtering, skipping the data plotting step\n";
		($st, $cov_ln, $read_count)=&fastq_parser($fw, $rw, $sc, $prefix, $mq, $l, $q);
             

           }

elsif(($filtering =~ /Yes/i) && ($plotting =~ /Yes/i)) 
	 {
   		die "\nInput parameters required",$usage unless  ($fw && $rw && $prefix && $gsize);
           print "STEP.1: Performing data filtering and calculating data plotting parameters\n";
	        ($st, $cov_ln, $read_count)=&fastq_parser($fw, $rw, $sc, $prefix, $mq, $l, $q);
             }

elsif(($plotting =~ /Yes/i) && ($filtering =~ /No/i)) 
	 {
  		 die "\nInput parameters required",$usage unless  ($fw && $rw && $gsize && $prefix);
           print "STEP.1: Performing data plotting parameters calculations, skipping the data filtering step\n";
        	 ($st, $cov_ln, $read_count)=&fastq_parser($fw, $rw, $sc, $prefix, 0, $l, 0);	  
               }

elsif(($plotting =~ /No/i) && ($filtering =~ /No/i)) 
	 {
  		 die "\nPlease choose Plotting and/or Filtering option(s)",$usage unless  ($fw && $rw && $gsize);
               }


if($plotting =~ /Yes/i){
print "STEP.2: Writing files for data plotting\n";
&plotting($st, $cov_ln, $read_count, $l, $gsize, $prefix);
}
print "
Successfully completed!
";

###############################
## Subroutines
###############################


sub fastq_parser{

                my($fw, $rw, $scr, $prefix, $mq, $l, $aq)=@_;
		my ($count, $singles, $paired)=(0, 0, 0);
		
		open F,"<$fw" or die$!;
		open R,"<$rw" or die$!;
		open FF,">$prefix\_R1.fastq" or die$! if ($filtering =~ /Yes/i);
		open RF,">$prefix\_R2.fastq" or die$! if ($filtering =~ /Yes/i);
		open S,">$prefix\_Singltons.fastq" or die$! if ($filtering =~ /Yes/i);
        
        my %stats; my %cov_len;       
        
        if($filtering =~ /Yes/i){
        open LQD,">$prefix\_Reads_Average_quality_and_Length_distribution_table.tab" or die$!;
        print LQD "Read_id\tRead1_Len\tRead1_AvgQul\tRead1_MiniQul\tRead1_N(0 or 1)\tRead2_Len\tRead2_AvgQul\tRead2_MiniQul\tRead2_N(0 or 1)\n"; 
        }
             #Opening and processing Fastq input files                     
                do{
                     chomp(my $f=<F>); $f =~ s/(.*)\s(.*)/$1/g; $f =~ s/(.*)\/(.*)/$1/g;  #Forward R1 file Id line
                     chomp(my $r=<R>); $r =~ s/(.*)\s(.*)/$1/g; $r =~ s/(.*)\/(.*)/$1/g;  #Reverse R2 file Id line
                    if($f =~ /$r/) { 
				     my $fastq_First_Seq=(<F>);  #Forward R1 file sequence line
				     my $fastq_Second_Seq=(<R>); #Reverse R2 file sequence line
                     		     $count++;
					my ($len1, $len2)=(0, 0);
					my ($n1, $n2)=(0, 0);
					my ($av1, $av2)=(0, 0);
					my ($mq1, $mq2)=(0, 0);

				     $len1=length($fastq_First_Seq)-1;  #Forward R1 sequence length
				     $len2=length($fastq_Second_Seq)-1; #Reverse R2 sequence length
  
   
                                     if($fastq_First_Seq =~ /N/i) {$n1=1;} #Checking for presence of Ns in R1
                                     if($fastq_Second_Seq =~ /N/i){$n2=1;} #Checking for presence of Ns in R2

 				     my $fp=<F>; my $rp=<R>; 
				     if(($fp !~ /\+/) && ($rp !~ /\+/))  #Checking for '+' line in R1 and R2
                                     {die "ERROR:Sequence are not in fastq format\n\n";}
		
				     #Fetching sequences 	
				     my $fastq_First_Qul_ASCII=(<F>); 
				     my $fastq_Second_Qul_ASCII=(<R>);
							      
				     #Calculating minimum quality and total quality 	
                                     my ($min_qul_first, $sum_first) = &quality_cal($fastq_First_Qul_ASCII, $scr);
                                     my ($min_qul_second, $sum_second)= &quality_cal($fastq_Second_Qul_ASCII, $scr);
                                     
				     $mq1=$min_qul_first;
 			             $mq2=$min_qul_second;
                                     
				     $av1=sprintf("%.2f", ($sum_first)/$len1); #Calculating average quality of R1 seq
				     $av2=sprintf("%.2f",($sum_second)/$len2); #Calculating average quality of R2 seq
         
         			     print LQD "$f\t$len1\t$av1\t$mq1\t$n1\t" if($filtering =~ /Yes/i);
         			     print LQD "$len2\t$av2\t$mq2\t$n2\n" if($filtering =~ /Yes/i);
         
                                     if((!$n1)&&(!$n2)&&($plotting =~ /Yes/i)) #Calculating plotting parameters
      	   				 {
            				  for (my $i=18; $i<=30; $i=$i+4)
						{
				                  if(($av1 >= $i) && ($av2 >= $i))
            						{
          				         	   for (my $j=3; $j<=18; $j=$j+5)
                              					{ 
                           					    if(($mq1 >= $j)&& ($mq2 >=$j))
                                   				       {
                        						  for (my $k=$l-50; $k<=$l; $k=$k+10)
                              	       					    { 
            		               					     if(($len1 >= $k)&& ($len2 >= $k))
                                             					{
 		                                 				$stats{$i}{$j}{$k}++;
					   	 				$cov_len{$i}{$j}{$k}{Len}+=$len1+$len2;
  					      					}	                  
             			   	 				     }
                                    					}
            		      					}
     	                				}     
           	  				}
             				}
                                   
                  if($filtering =~ /Yes/i) #Performing filtering step
				{
				     if(
					  (( $len1 >= $l) && ($len2 >= $l)) &&	
					  (( $mq1 >= $mq) && ( $mq2 >= $mq))&&
					  (( $av1 >= $aq) && ( $av2 >= $aq))&&
					  (( !$n1) && ( !$n2 ))
				       )

					{ $paired++; 
					print FF "$f/1\n$fastq_First_Seq+\n$fastq_First_Qul_ASCII";
					print RF "$r/2\n$fastq_Second_Seq+\n$fastq_Second_Qul_ASCII";
					  }	
				 
                                    if(
					  (( $len1 >= $l) && (( $len2 < $l)||
					     ($mq2 < $mq)||($av2<$aq)||
					     ($fastq_Second_Seq =~/N/i))) &&	
					  (( $mq1 >= $mq))&&
					  (( $av1 >= $aq))&&
					  (( $fastq_First_Seq !~ /N/i ))
				       )

					{
					$singles++;
					print S "$f/1\n$fastq_First_Seq+\n$fastq_First_Qul_ASCII";
				    	}
				     if(
					  (( $len2 >= $l) && (( $len1 < $l)||
					     ($mq1 < $mq)||($av1 < $aq)||
					     ($fastq_First_Seq =~/N/i))) &&	
					  (( $mq2 >= $mq))&&
					  (( $av2 >= $aq))&&
					  (( $fastq_Second_Seq !~ /N/i))
				       )

					{
					$singles++;
					print S "$r/2\n$fastq_Second_Seq+\n$fastq_Second_Qul_ASCII";
				    	}
				    }
                                        sub quality_cal #Perl subroutine calculating phred quality values
					{
					      my @s=split('',$_[0]); pop(@s); my $sc=$_[1]; my @qul;
 					      my $sum=0; my $min=0; 
   					 	 
                                              foreach my $ml(@s)
         				 	 {
          					  push(@qul, ord($ml)-$sc);
                     				  $sum=$sum+ord($ml)-$sc;  
                                                    }
						
      						@qul= sort {$a <=> $b} @qul; $min=shift(@qul);
          					return ($min, $sum);
  					 }
                          
          
                                }
                    else          {  #Program quits if the ids of R1 and R2 are not identical
                                     die "
					      ERROR: Forward and Reverse mates are not in order 
					       
 					      Script accepts forward and reverse read ids in following formats:
					      1. \@HWUSI-EAS100R:6:73:941:1973#0/1 \@HWUSI-EAS100R:6:73:941:1973#0/2
 	     				      2. \@HWUSI-EAS100R:6:73:941:1973#0 1 \@HWUSI-EAS100R:6:73:941:1973#0 2 
       					  
         				      Please convert your read ids accordingly and rerun the script.	
										
						\n\n";
			  	   }
  		 }until eof(F);

	if($filtering =~/Yes/i){
	open L,">Log" or die$!;
	my $qltp=($paired/$count)*100;
	my $qlts=($singles/$count)*100;

 print L "\n\nTotal number of reads: $count
             
            Reads without Ns, average quality cutoff $aq, minimum quality of a base more than $mq and length cutoff $l: $paired($qltp%)
	    Number of single reads generated after filtering: $singles($qlts%)	 	    

            This script has generated trimmed files:
 
            \"$prefix\_R1.fastq\"
            \"$prefix\_R2.fastq\"
          
 	";}

close (F);
close (R);
close (FF) if($filtering =~/Yes/i) ;
close (RF) if($filtering =~/Yes/i) ;
close (L) if($filtering =~/Yes/i);
close (S) if($filtering =~/Yes/i);
close (LQD);

return (\%stats, \%cov_len, $count);
}

sub plotting{

	my $pl=shift(@_);
	my $coverage_ln=shift(@_);
	my $count=shift(@_);
	my $len=shift(@_);
	my $gsize=shift(@_); $gsize=$gsize*1000000;
        my $pfix=shift(@_); 	
      
        my %plot= %{$pl};
	my %cov_plot= %{$coverage_ln};

        #Generating files for plotting
	open P,">$pfix\_file_for_plotting_reads_percentages.txt";
	open C,">$pfix\_file_for_plotting_Coverage.txt";
	open B,">$pfix\_breif_log.txt";

	print P "Average_Quality\tMinimum_base_quality"; for (my $k=$len-50; $k<=$len; $k=$k+10){print P "\t$k";}
	print C "Average_Quality\tMinimum_base_quality"; for (my $lm=$len-50; $lm<=$len; $lm=$lm+10){print C "\t$lm";}

      foreach my $avg_ql (sort { $a <=> $b } keys %plot)
             {
 	       foreach my $minimum_ql (sort { $a <=> $b } keys %{$plot{$avg_ql}}) 
              
                {
		print P "\n$avg_ql\t$minimum_ql";
		print C "\n$avg_ql\t$minimum_ql";
           	
                 foreach my $length (sort  { $a <=> $b } keys %{$plot{$avg_ql}{$minimum_ql}})
                 
		    {     
		     my $per = sprintf("%.2f", ($plot{$avg_ql}{$minimum_ql}{$length}/$count)*100);
		     my $cove= sprintf("%.2f", ($cov_plot{$avg_ql}{$minimum_ql}{$length}{Len})/$gsize);
                     print B "Number of reads with Average Quality $avg_ql, Minimum base quality $minimum_ql and length $length: $plot{$avg_ql}{$minimum_ql}{$length} ($per%)\tCoverage depth: $cove x\n";                  
                       print P "\t$per";                  
                       print C "\t$cove";                  
                     }
                 
          		print B "\n";
                  }

	          print B "\n";
       		  print B "\n";
       		}

close B;
close P;
close C;
}	
__END__
