#!/usr/bin/perl

#      Copyright (C) 2006  Jifeng Tang                                                  #

#       This program is free software; you can redistribute it and/or                   #
# 	modify it under the terms of the GNU General Public License                     #
#       as published by the Free Software Foundation; either version 2                  #
#       of the License, or (at your option) any later version.                          #

#       This program is distributed in the hope that it will be useful,                 #
#       but WITHOUT ANY WARRANTY; without even the implied warranty of                  #
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                   #
#       GNU General Public License for more details.                                    #

#       You should have received a copy of the GNU General Public License               #
#       along with this program; if not, write to the Free Software                     #
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. #

#########################################################################################
# The program gets multiple sequence alignment from *.ace #

my @cap_array = "";
my $cap_line = "";
my $sub_line = "";
my $add_sequence = "";
my $count = "";
my $contig = "";
my @AF = "";
my $AF_count = "";
my @sub_array = "";
my $sub_line = "";
my @QA = "";
my @sequence = "";
my $length = "";
my $head = "";
my $tail = "";
my @bases = "";
my $add_x = "";
my $x_count = "";
my $bases = "";
my $leader = "";
my $infile = "";
my $clustersize = "";
my @counter=(0..0); #counter number of contig according EST number included

#get infile anme
chomp ($infile = $ARGV[0]);
chomp ($clustersize = $ARGV[1]);

if($infile !~/\D/)
{
#input infile name
#print "Enter infile name: \n";
chomp($infile = <STDIN>);
}
else{
#print "input file name:$infile \n";

my $capfile = "$infile.ace";
print "Get some information from $capfile\n";

open (CAP_INFILE, "$capfile");
open (CAP_LIST, ">availcontiglist"); #>>availcontiglist is append contigname to a file;>availcontiglist is creating a file.
#open (CAP_LIST1,">availcontiglistname"); #availcontiglist only including contig name not est number.
open (CAP_LIST2,">contiglistmore"); #>>contiglist is append contigname to a file;>contiglist300 more than 300 reads ESTs
open (CAP_LIST3,">contigstatisticinfo");
open (CAP_LIST0,">contiglistless");

while (defined ($cap_line = <CAP_INFILE>))
  {
  $cap_line =~ s/CO Contig/@ CO Contig/gi;

  push @cap_array, $cap_line;
  }
$cap_line = join ('_join_', @cap_array);
@cap_array = split (/@/,$cap_line);
shift @cap_array;

#put alignment information to data directory
system("mkdir data");
#ESTcounter1 for counting availcontig;ESTcounter2 for counting contigs more than 100 EST members
$ESTcounter1 =0; 
$ESTcounter2 =0;
$ESTcounter3 =0;

foreach $sub_line(@cap_array)
{

  $add_sequence = "no";
  @sequence = "";
  @QA = "";
  $AF_count = 0;
  @AF = "";
  $count = 0;

  $contig++;

  @sub_array = split (/_join_/, $sub_line);

   foreach (@sub_array)
   {

   if($_ =~ m/CO Contig/g)
   {
    $contig_name = $_;

   # @contig_id= split(/(\s+)/,$contig_name,8);
   # print "$contig_id[4],$contig_id[6],$contig_id[8]"; # these two sentences are correct
   @contig_id= split(m"\s+",$contig_name,8);
   #print "$contig_id[2],$contig_id[3],$contig_id[4]";

   $contigid = $contig_id[2];
   $contiglen = $contig_id[3];
   $contigsize = $contig_id[4];
   #print "$contigid,$contiglen,$contigsize";
   }

   if ($_ =~ /^AF /)
    {
    $AF_count++;
    $AF[$AF_count] = $_;
    }
   if ($_ =~ /^RD /)
    {
    $add_sequence = "yes";
    $count++;
    }
   if ($_ =~ /^QA /)
    {
    $QA[$count] = $_;
    $add_sequence = "no";
    }
   #if ($add_sequence eq "yes" && $_ =~ /^[XCATGN\*]/)
    if ($add_sequence eq "yes" && $_ =~ /^[XCATGNactgnx\*]/)
    {
    $sequence[$count] = $sequence[$count].$_;
    }
   }
#statistic contig according the number of EST $AF_count 
use Switch;
switch($AF_count){
case 1  {$counter[0]++;}
case 2  {$counter[1]++;}
case 3  {$counter[2]++;}
case 4  {$counter[3]++;}
case 5  {$counter[4]++;}
case 6  {$counter[5]++;}
case 7  {$counter[6]++;}
case 8  {$counter[7]++;}
case 9  {$counter[8]++;}
case 10  {$counter[9]++;}
case 11  {$counter[10]++;}
case 12  {$counter[11]++;}
case 13  {$counter[12]++;}
case 14  {$counter[13]++;}
case 15  {$counter[14]++;}
case 16  {$counter[15]++;}
case 17  {$counter[16]++;}
case 18  {$counter[17]++;}
case 19  {$counter[18]++;}
case 20  {$counter[19]++;}
case 21  {$counter[20]++;}
case 22  {$counter[21]++;}
case 23  {$counter[22]++;}
case 24  {$counter[23]++;}
case 25  {$counter[24]++;}
case 26  {$counter[25]++;}
case 27  {$counter[26]++;}
case 28  {$counter[27]++;}
case 29  {$counter[28]++;}
case 30  {$counter[29]++;}
case 31  {$counter[30]++;}
case 32  {$counter[31]++;}
case 33  {$counter[32]++;}
case 34  {$counter[33]++;}
case 35  {$counter[34]++;}
case 36  {$counter[35]++;}
case 37  {$counter[36]++;}
case 38  {$counter[37]++;}
case 39  {$counter[38]++;}
case 40  {$counter[39]++;}
case 41  {$counter[40]++;}
case 42  {$counter[41]++;}
case 43  {$counter[42]++;}
case 44  {$counter[43]++;}
case 45  {$counter[44]++;}
case 46  {$counter[45]++;}
case 47  {$counter[46]++;}
case 48  {$counter[47]++;}
case 49  {$counter[48]++;}
case 50  {$counter[49]++;}
case 51  {$counter[50]++;}
case 52  {$counter[51]++;}
case 53  {$counter[52]++;}
case 54  {$counter[53]++;}
case 55  {$counter[54]++;}
case 56  {$counter[55]++;}
case 57  {$counter[56]++;}
case 58  {$counter[57]++;}
case 59  {$counter[58]++;}
case 60  {$counter[59]++;}
case 61  {$counter[60]++;}
case 62  {$counter[61]++;}
case 63  {$counter[62]++;}
case 64  {$counter[63]++;}
case 65  {$counter[64]++;}
case 66  {$counter[65]++;}
case 67  {$counter[66]++;}
case 68  {$counter[67]++;}
case 69  {$counter[68]++;}
case 70  {$counter[69]++;}
case 71  {$counter[70]++;}
case 72  {$counter[71]++;}
case 73  {$counter[72]++;}
case 74  {$counter[73]++;}
case 75  {$counter[74]++;}
case 76  {$counter[75]++;}
case 77  {$counter[76]++;}
case 78  {$counter[77]++;}
case 79  {$counter[78]++;}
case 80  {$counter[79]++;}
case 81  {$counter[80]++;}
case 82  {$counter[81]++;}
case 83  {$counter[82]++;}
case 84  {$counter[83]++;}
case 85  {$counter[84]++;}
case 86  {$counter[85]++;}
case 87  {$counter[86]++;}
case 88  {$counter[87]++;}
case 89  {$counter[88]++;}
case 90  {$counter[89]++;}
case 91  {$counter[90]++;}
case 92  {$counter[91]++;}
case 93  {$counter[92]++;}
case 94  {$counter[93]++;}
case 95  {$counter[94]++;}
case 96  {$counter[95]++;}
case 97  {$counter[96]++;}
case 98  {$counter[97]++;}
case 99  {$counter[98]++;}
case 100  {$counter[99]++;}
else  {$counter[100]++;}
} #end switch

#add get the number of every contigs
#get clusters with at least $clustersize sequences and with less than 500 sequences; if you want to 500 to more than 500, you should change C programs realted to it
if($AF_count >$clustersize-1 && $AF_count<501)
{
  $contigname ="Contig$contig";
  #print "contigname:$contigname";
  if($AF_count == $contigsize && $contigname eq $contigid)
  {
  open (CAP_OUT, ">contig_$contig");
  print CAP_LIST "contig_$contig $AF_count $contiglen\n";
  }
  $ESTcounter1++;
foreach (@sequence)
{

s/[^A-Z\*-]//gi;
s/\*/-/g;
}
$count = 1;
$length = @sequence;
%re_order = ();
while ($count < $length)
{

$QA[$count] =~ /QA\s(\d*)\s(\d*)/;

$head = $1 - 1;
$tail = $2 - 1;

@bases = split (//,$sequence[$count]);
@bases = @bases[$head..$tail];
$bases = join ('', @bases);
$AF[$count] =~ /\s(-?\d*)$/;
$add_x = $1 - 1;
$AF[$count] =~ s/\D\s-?\d*$//;
$AF[$count] =~ s/^AF\s//;
chomp ($this_seq_name = $AF[$count]);
$x_count = $head + $add_x;
$leader = "." x $x_count;
$bases = $leader.$bases;
$re_order{$this_seq_name} = $bases;
$print_order_count = 1;
#print "seqname:$this_seq_name,start:$head,stop:$tail\n"; #give the length
$count++;
}
#re-order sequences by name to help group those of the same variety

foreach $seq_in_order (sort keys %re_order)
{
#add unigene information such unigene Id
#print "seq:$seq_in_order\n";
$seqname = $seq_in_order;
chop($seqname);
print CAP_OUT ">$seqname\n$re_order{$seq_in_order}\n";
$fasta_format =  $re_order{$seq_in_order};
$fasta_format =~ s/[\.\-]//g;
#print CAP_OUT2 ">$seq_in_order\n$fasta_format\n";
$print_order_count++;
}
close CAP_OUT;
#close CAP_OUT2;
#put every contigs to the directory data

system("mv contig_* data/");

}elsif($AF_count>500)
{  
   print CAP_LIST2 "contig_$contig $AF_count\n";
   $ESTcounter2++;
}elsif($AF_count<$clustersize)
{
   printf CAP_LIST0 "contig_$contig $AF_count\n";
   $ESTcounter3++;
}
}
print CAP_LIST2 "total $ESTcounter1 contigs with more than $clustersize and less than 500 EST;$ESTcounter2 with more than 500 ESTs;$ESTcounter3 contigs with less than $clustersize";
#output staticstic information
for($i=0;$i<@counter;$i++)
{
    $j=$i+1;
   print CAP_LIST3 "$j	$counter[$i]\n";
}

close CAP_LIST;
close CAP_INFILE;
close CAP_LIST2;
close CAP_LIST3;

}
