#!/usr/bin/env perl
use warnings;



# Copyright 2009 - 2018 Christopher Benner <cbenner@ucsd.edu>
#
# This file is part of HOMER
#
# HOMER is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HOMER is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

use POSIX;

# Annotation categories that may join the "exon" background set used when the
# enrichment thresholds are estimated (a true value enables the category).
my $includeSingleExonsFlag = 1;
my $include1stExonFlag = 1;

# Default settings; several can be overridden from the command line.
my $maxCPUs = 3;              # forwarded to annotatePeaks.pl as -cpu
my $gtfTssSize = 400;         # -size given to adjustPeakFile.pl for GTF-derived TSS regions
my $maxRNALog2Fold = 3;       # upper cap for the automatic threshold vs. RNA-seq
my $maxInputLog2Fold = 3;     # upper cap for the automatic threshold vs. input
my $minDistDiff = 0.15;       # smallest distribution difference accepted for an automatic threshold
my $defaultLog2Fold = 1.0;    # threshold used when the automatic estimate is rejected
my $freqSize = 2000;          # -size given to annotatePeaks.pl for the final -hist runs
# Extra arguments appended to the initial "findPeaks -style tss" call.
# Declared lexically like the other settings instead of as an implicit package global.
my $findPeaksTSSoptions = " -size 150 -L 2 -ntagThreshold 15 ";

# printCMD - write the command-line usage summary to STDERR and terminate.
#
# Takes no arguments and never returns: the script exits as soon as the text
# has been printed. The defaults shown are interpolated from the file-level
# settings ($minDistDiff, $defaultLog2Fold, $maxInputLog2Fold,
# $maxRNALog2Fold) with whatever values they hold at call time.
sub printCMD {
	my @usageText = (
		"\n\tfindcsRNATSS.pl <csRNA tag directory> [options]\n",
		"\n\tOptions:\n",
		"\t\t-o <prefix> \n",
		"\t\t-i <csRNA input tag directory>\n",
		"\t\t-rna <RNAseq tag directory>\n",
		"\t\t-gtf <gtf file>\n",
		"\t\t-genome <genome>\n",
		"\t\t-cpu <#> (max CPUs)\n",
		"\t\t-minDistDiff <#> (default: $minDistDiff)\n",
		"\t\t-defaultLog2Fold <#> (default: $defaultLog2Fold)\n",
		"\t\t-maxInputLog2Fold <#> (maximum log2 fold enrichment vs. input or RNA, default: $maxInputLog2Fold)\n",
		"\t\t-maxRNALog2Fold <#> (maximum log2 fold enrichment vs. input or RNA, default: $maxRNALog2Fold)\n",
		"\n",
	);
	foreach my $usageLine (@usageText) {
		print STDERR $usageLine;
	}
	exit;
}

# Command-line handling: the first argument is the csRNA tag directory, the
# remaining arguments are "-flag value" pairs. Any unknown flag, or a flag
# without a value, prints the usage text (printCMD exits the script).
if (@ARGV < 1) {
	printCMD();
}

my $prefix = "out";
my $tagDir = $ARGV[0];
my $inputDir = '';
my $rnaDir = '';
my $gtfFile = '';
my $gtfOptions = '';
my $genome = 'none';

# Maps each accepted flag to the variable that receives its value.
my %optionTarget = (
	'-i'                => \$inputDir,
	'-rna'              => \$rnaDir,
	'-o'                => \$prefix,
	'-prefix'           => \$prefix,
	'-gtf'              => \$gtfFile,
	'-genome'           => \$genome,
	'-minDistDiff'      => \$minDistDiff,
	'-defaultLog2Fold'  => \$defaultLog2Fold,
	'-maxRNALog2Fold'   => \$maxRNALog2Fold,
	'-maxInputLog2Fold' => \$maxInputLog2Fold,
	'-cpu'              => \$maxCPUs,
);
for (my $i=1;$i<@ARGV;$i++) {
	my $flag = $ARGV[$i];
	if (!exists($optionTarget{$flag})) {
		printCMD();
	}
	# Previously a flag given as the very last argument silently stored undef;
	# report it instead of continuing with a missing setting.
	if ($i+1 >= @ARGV) {
		print STDERR "!!! Error: option $flag requires a value\n";
		printCMD();
	}
	${$optionTarget{$flag}} = $ARGV[++$i];
}

# Neither a genome nor a GTF file is mandatory at the moment: the check below
# is deliberately left disabled.
if ($gtfFile eq '' && ($genome eq '' || $genome eq 'none')) {
	#print STDERR "!! Error: For now either a homer-configured genome or a gtf file is required\n";
	#exit;
}


# Every temporary file shares one random number as its name stem so that the
# files of a single run can be told apart from those of other runs.
my $rand = rand();

# General-purpose scratch files.
my $tmpFile = "${rand}.tmp";
my $tmpFile2 = "${rand}.2.tmp";
my $tmpFile3 = "${rand}.3.tmp";
my $tmpFile4 = "${rand}.4.tmp";

# Intermediate peak files: the initial TSS calls and the GTF-derived sets.
my $initialTSSfile = "${rand}.tss.tmp";
my $gtfTSSfile = "${rand}.gtfTSS.tmp";
my $gtfSingleExonFile = "${rand}.gtfSingleExon.tmp";
my $gtf1stExonFile = "${rand}.gtf1stExon.tmp";
my $gtfOtherExonsFile = "${rand}.gtfOtherExons.tmp";
my $divergentTSSfile = "${rand}.divergentTSS.txt";
my $antisenseTSSfile = "${rand}.antisenseTSS.txt";



# Call the initial candidate TSS set from the csRNA tag directory; the output
# of findPeaks is stored in $initialTSSfile and reused by all later steps.
`findPeaks "$tagDir" -style tss $findPeaksTSSoptions > $initialTSSfile`;

# With a GTF file, build the reference peak files that are later handed to
# annotatePeaks.pl through -p. $tmpFile and $tmpFile2 are reused as scratch
# space between the steps, so the order of the commands below matters.
if ($gtfFile ne '') {
	# Annotated TSS positions resized to $gtfTssSize, plus a strand-flipped
	# copy resized to 600 (the "antisense TSS" file).
	`parseGTF.pl "$gtfFile" tss > $tmpFile`;
	`adjustPeakFile.pl $tmpFile -size $gtfTssSize > $gtfTSSfile`;
	`adjustPeakFile.pl $gtfTSSfile -flipStrand > $tmpFile`;
	`adjustPeakFile.pl $tmpFile -size 600 > $antisenseTSSfile`;

	# Split the exon list by the text of each line: lines containing
	# "exon 1 of 1" go to the single-exon file; of the remaining lines, those
	# containing "exon 1 of" go to the first-exon file and the rest to the
	# other-exons file.
	`parseGTF.pl "$gtfFile" exons > $tmpFile`;
	`grep "exon 1 of 1" $tmpFile > $gtfSingleExonFile`;
	`grep -v "exon 1 of 1" $tmpFile > $tmpFile2`;
	`grep "exon 1 of" $tmpFile2 > $gtf1stExonFile`;
	`grep -v "exon 1 of" $tmpFile2 > $gtfOtherExonsFile`;

	# Strand-flipped copy of the initial TSS calls resized to 600 (the
	# "divergent TSS" file).
	`adjustPeakFile.pl $initialTSSfile -flipStrand > $tmpFile`;
	`adjustPeakFile.pl $tmpFile -size 600 > $divergentTSSfile`;

	`rm $tmpFile $tmpFile2`;
	# The order of the six -p files fixes the order of the count columns that
	# the annotation parsing reads afterwards.
	$gtfOptions = " -gtf \"$gtfFile\" -pcount -p $gtfTSSfile $gtfSingleExonFile $gtf1stExonFile $gtfOtherExonsFile $divergentTSSfile $antisenseTSSfile";
# NOTE(review): this condition is always true ('||' where '&&' was presumably
# intended); the branch body is empty, so it currently has no effect.
} elsif ($genome ne '' || $genome ne 'none') {

}

#if ($tagDir ne ''  $inputDir ne '') {
# Quantify each candidate TSS on its own strand with annotatePeaks.pl and load
# the result into %data, keyed by peak ID. Each entry holds:
#   str        - the first seven columns joined by tabs (ID, chr, start, end,
#                strand, score, focus ratio)
#   csRNA      - column 20 of the annotatePeaks.pl output
#   csRNAinput - next column when -i was given, otherwise 'na'
#   rna        - next column when -rna was given, otherwise 'na'
#   ann        - annotation label derived from the six -p count columns
#                (only when -gtf was given), otherwise 'na'
# The column positions assume annotatePeaks.pl appends the -d tag directory
# columns and then the -p count columns in the order given on the command
# line - TODO confirm against the annotatePeaks.pl documentation.
`annotatePeaks.pl $initialTSSfile $genome -strand + -fragLength 1 -d $tagDir $inputDir $rnaDir -cpu $maxCPUs $gtfOptions > $tmpFile2`;

my %data = ();
my $c = 0;
open(my $annFh, '<', $tmpFile2)
	or die "!!! Error: could not open annotatePeaks.pl output ($tmpFile2): $!\n";
while (<$annFh>) {
	$c++;
	next if ($c < 2);	# first line is the header
	chomp;
	s/\r//g;
	my @line = split /\t/;

	# Check the column count before touching any column so that short lines
	# are skipped without uninitialized-value warnings.
	if (@line < 20) {
		print STDERR "Warning... not enough columns in file\n";
		next;
	}
	my $id = $line[0];
	my $str = join("\t", @line[0..6]);

	my $csRNA = $line[19];
	my $csRNAinput = 'na';
	my $rna = 'na';
	my $ann = 'na';
	my $col = 20;
	if ($inputDir ne '' && @line > $col) {
		$csRNAinput = $line[$col];
		$col++;
	}
	if ($rnaDir ne '' && @line > $col) {
		$rna = $line[$col];
		$col++;
	}
	if ($gtfFile ne '' && @line > $col+5) {
		# Count columns, in -p order: col = GTF TSS, col+1 = single exon,
		# col+2 = first exon, col+3 = other exons, col+4 = divergent TSS,
		# col+5 = antisense TSS. The first matching test wins.
		if ($line[$col+1] ne '' && $line[$col+1] > 0) {
			$ann = 'singleExon';
		} elsif ($line[$col] ne '' && $line[$col] > 0) {
			$ann = 'tss';
		} elsif ($line[$col+3] ne '' && $line[$col+3] > 0) {
			# The earlier code also looked at the divergent TSS count here
			# to choose 'otherExonBidirectional', but then unconditionally
			# overwrote the label, so 'otherExon' was always the result.
			# That effective behaviour is kept.
			$ann = 'otherExon';
		} elsif ($line[$col+2] ne '' && $line[$col+2] > 0) {
			$ann = 'firstExon';
		} elsif ($line[$col+5] ne '' && $line[$col+5] > 0) {
			$ann = 'tssAntisense';
		} else {
			$ann = 'other';
		}
	}
	$data{$id} = {str=>$str, csRNA=>$csRNA, csRNAinput=>$csRNAinput, rna=>$rna,ann=>$ann};
}
close($annFh);
`rm $tmpFile2`;

# Order the peak IDs by decreasing csRNA signal; every later loop walks the
# peaks in this order.
my @tssIDs = sort { $data{$b}{'csRNA'} <=> $data{$a}{'csRNA'} } (keys %data);
my $max = @tssIDs;
my $index = floor(0.9 * $max);
# $max and $index only serve the data-driven pseudocount below, which is
# currently switched off in favour of a fixed value.
#my $pseudoCount = $data{$tssIDs[$index]}->{'csRNA'} * 0.5;

# Pseudocount added to numerator and denominator of every log2 ratio.
$pseudoCount = 8;
if (0.1 > $pseudoCount) {
	print STDERR "!!! Warning/Error: PseudoCount ($pseudoCount) calculation is less than 0.1!!!!\n";
	exit;
}
print STDERR "\tPsuedoCount set at $pseudoCount\n";



#bidirectional stuff
# Re-quantify every candidate TSS on regions produced by adjustPeakFile.pl with
# "-flipStrand -size -300,0" (presumably the opposite-strand region next to
# each TSS - confirm against the adjustPeakFile.pl documentation) and attach
# the results to the matching %data entry:
#   revcsRNA / revcsRNAinput / revrna - the same columns as in the first pass
#   revfold - log2((forward csRNA + pseudocount) / (reverse csRNA + pseudocount)),
#             or 'na' when either value is 'na'
`adjustPeakFile.pl $initialTSSfile -flipStrand -size -300,0 > $tmpFile`;
`annotatePeaks.pl $tmpFile $genome -strand + -fragLength 1 -d $tagDir $inputDir $rnaDir -cpu $maxCPUs $gtfOptions > $tmpFile2`;

$c = 0;
open(my $revFh, '<', $tmpFile2)
	or die "!!! Error: could not open annotatePeaks.pl output ($tmpFile2): $!\n";
while (<$revFh>) {
	$c++;
	next if ($c < 2);	# first line is the header
	chomp;
	s/\r//g;
	my @line = split /\t/;
	my $id = $line[0];

	if (@line < 20) {
		print STDERR "Warning... not enough columns in file\n";
		next;
	}
	# Peaks that were skipped in the first pass have no forward values; do not
	# create a half-filled entry for them (it would never be reported and
	# would feed undef into the ratio below).
	next if (!exists($data{$id}));

	my $csRNA = $line[19];
	my $csRNAinput = 'na';
	my $rna = 'na';
	my $col = 20;
	if ($inputDir ne '' && @line > $col) {
		$csRNAinput = $line[$col];
		$col++;
	}
	if ($rnaDir ne '' && @line > $col) {
		$rna = $line[$col];
		$col++;
	}
	$data{$id}->{'revcsRNA'} = $csRNA;
	$data{$id}->{'revcsRNAinput'} = $csRNAinput;
	$data{$id}->{'revrna'} = $rna;
	my $logRatio = 'na';
	my $ogLevel = $data{$id}->{'csRNA'};
	if ($ogLevel ne 'na' && $csRNA ne 'na') {
		$logRatio = log(($ogLevel+$pseudoCount)/($csRNA+$pseudoCount))/log(2.0);
	}
	$data{$id}->{'revfold'} = $logRatio;
}
close($revFh);
`rm $tmpFile $tmpFile2`;

# Annotation labels whose peaks form the "exon" background set. 'otherExon' is
# always included; the two flags add single-exon and first-exon peaks.
my %possible = ('otherExon' => 1);
$possible{'singleExon'} = 1 if ($includeSingleExonsFlag);
$possible{'firstExon'} = 1 if ($include1stExonFlag);

# For every peak store the log2 enrichment of csRNA over input ('foldInput')
# and over RNA-seq ('foldRNA'); a value stays 'na' when the corresponding
# measurement is 'na'. Peaks labelled 'tss' and peaks with a background label
# additionally contribute a [foldInput, foldRNA] pair to @tss / @exons.
my @tss = ();
my @exons = ();
for my $tssID (@tssIDs) {
	my $entry = $data{$tssID};
	my $signal = $entry->{'csRNA'};
	my ($foldInput, $foldRNA) = ('na', 'na');

	my $inputLevel = $entry->{'csRNAinput'};
	unless ($inputLevel eq 'na') {
		$foldInput = log(($signal+$pseudoCount)/($inputLevel+$pseudoCount))/log(2.0);
	}
	my $rnaLevel = $entry->{'rna'};
	unless ($rnaLevel eq 'na') {
		$foldRNA = log(($signal+$pseudoCount)/($rnaLevel+$pseudoCount))/log(2.0);
	}
	$entry->{'foldInput'} = $foldInput;
	$entry->{'foldRNA'} = $foldRNA;

	my $label = $entry->{'ann'};
	push(@tss, [$foldInput, $foldRNA]) if ($label eq 'tss');
	push(@exons, [$foldInput, $foldRNA]) if (exists($possible{$label}));
}
# Estimate the log2 enrichment thresholds (z=0: vs. input, z=1: vs. RNA-seq)
# by comparing the fold distribution of annotated TSS peaks (@tss) with that
# of the exon background peaks (@exons).
#
# Both sets are sorted by decreasing fold. Walking down the TSS list, fracX is
# the fraction of TSS peaks seen so far and fracY the fraction of background
# peaks with a strictly larger fold than the current TSS peak. The fold value
# at which fracX - fracY is largest becomes the threshold. Every step is also
# written to <prefix>.inputDistribution.txt / <prefix>.rnaDistribution.txt.
# When the largest difference is below $minDistDiff, $defaultLog2Fold is used.
my $numTSSann = scalar(@tss);
my $numExonann = scalar(@exons);
my $inputThresh = -1e10;
my $inputMaxDiff = -1e10;
my $rnaThresh = -1e10;
my $rnaMaxDiff = -1e10;
open(my $inputDistFh, '>', "$prefix.inputDistribution.txt")
	or die "!!! Error: could not open $prefix.inputDistribution.txt for writing: $!\n";
open(my $rnaDistFh, '>', "$prefix.rnaDistribution.txt")
	or die "!!! Error: could not open $prefix.rnaDistribution.txt for writing: $!\n";
for (my $z=0;$z<2;$z++) {
	next if ($z == 0 && $inputDir eq '');
	next if ($z == 1 && $rnaDir eq '');
	@tss = sort {$b->[$z] <=> $a->[$z]} @tss;
	@exons = sort {$b->[$z] <=> $a->[$z]} @exons;

	my $distFh = ($z == 0) ? $inputDistFh : $rnaDistFh;
	my $j=0;
	my $maxDiff = -1;
	my $thresh = 0;
	my $Nx = scalar(@tss);
	my $Ny = scalar(@exons);
	# Without any background peak the comparison is undefined (it used to die
	# with a division by zero). Skip the scan so that $maxDiff stays at -1 and
	# the default threshold is selected below.
	if ($Nx > 0 && $Ny < 1) {
		print STDERR "\t!!! Warning: no exon-annotated regions available for the comparison [$z]\n";
	}
	for (my $i=0;$Ny > 0 && $i<$Nx;$i++) {
		while ($j < $Ny && $exons[$j]->[$z] > $tss[$i]->[$z]) {
			$j++;
		}
		my $fracX = ($i+1)/$Nx;
		my $fracY = ($j)/$Ny;
		my $diff = $fracX-$fracY;
		print $distFh "$tss[$i]->[$z]\t$fracX\t$fracY\t$diff\n";
		if ($diff > $maxDiff) {
			$maxDiff = $diff;
			$thresh = $tss[$i]->[$z];
		}
	}
	print STDERR "\n\tMaxDiff [$z] = $maxDiff\n";
	print STDERR "\tThreshold [$z] = $thresh\n\n";
	if ($maxDiff < $minDistDiff) {
		print STDERR "\tDifference in distributions is too small: setting fold enrichment to default\n";
		$thresh = $defaultLog2Fold;
	}

	if ($z == 0 && $inputDir ne '') {
		$inputThresh = $thresh;
		$inputMaxDiff = $maxDiff;
	} elsif ($z == 1 && $rnaDir ne '') {
		$rnaThresh = $thresh;
		$rnaMaxDiff = $maxDiff;
	}
}
close($rnaDistFh) or die "!!! Error: could not finish writing $prefix.rnaDistribution.txt: $!\n";
close($inputDistFh) or die "!!! Error: could not finish writing $prefix.inputDistribution.txt: $!\n";



# Limit the automatically derived thresholds to their configured maxima and
# tell the user whenever a limit had to be applied.
if ($maxInputLog2Fold < $inputThresh) {
	print STDERR
		"\t!!!Warning: Automatic Log2 Fold enrichment vs. input ($inputThresh) greater than max ($maxInputLog2Fold)\n",
		"\t\tSometimes this can happen if the data is very clean where most of the initial peaks are legit TSS\n";
	$inputThresh = $maxInputLog2Fold;
}
if ($maxRNALog2Fold < $rnaThresh) {
	print STDERR
		"\t!!!Warning: Automatic Log2 Fold enrichment vs. rnaseq ($rnaThresh) greater than max ($maxRNALog2Fold)\n",
		"\t\tSometimes this can happen if the data is very clean where most of the initial peaks are legit TSS\n";
	$rnaThresh = $maxRNALog2Fold;
}

# Classify every peak and write it to the result tables:
#   <prefix>.input.txt    - peaks that do not exceed the input threshold
#   <prefix>.tss.txt      - peaks over the input and the RNA threshold
#   <prefix>.rna.txt      - peaks over the input but not the RNA threshold
#   <prefix>.annexons.txt - peaks over the input threshold labelled 'otherExon'
#   <prefix>.anntss.txt   - all other peaks over the input threshold
# A fold value of 'na' (measurement not available) always counts as passing.
# Each output file is opened with an explicit check so that an unwritable
# prefix stops the script instead of silently producing no output.
my %outFh = ();
foreach my $suffix ('tss', 'input', 'rna', 'anntss', 'annexons') {
	open($outFh{$suffix}, '>', "$prefix.$suffix.txt")
		or die "!!! Error: could not open $prefix.$suffix.txt for writing: $!\n";
}

my $total = 0;
my $Ntss = 0;
my $Noverinput = 0;
my $Ninput = 0;
my $Nrna = 0;
my $Nann = 0;
my $Nexon = 0;

my $header = "#tssID\tchr\tstart\tend\tstrand\tscore\tfocusRatio\tcsRNA\tcsRNAinput\trnaseq\tannotation";
$header .= "\tLog2Ratio vs. Input\tLog2Ratio vs. RNA";
$header .= "\trev-csRNA\trev-csRNAinput\trev-RNA\tBidirectionalRatio";
$header .= "\n";
foreach my $suffix (sort keys %outFh) {
	print {$outFh{$suffix}} $header;
}
foreach(@tssIDs) {
	my $id = $_;
	my $str = $data{$id}->{'str'} . "\t" . $data{$id}->{'csRNA'} . "\t" . $data{$id}->{'csRNAinput'};
	$str .= "\t" . $data{$id}->{'rna'} . "\t" . $data{$id}->{'ann'} . "\t" . $data{$id}->{'foldInput'};
	$str .= "\t" . $data{$id}->{'foldRNA'};
	# The reverse-strand values exist only for peaks found in the second
	# annotatePeaks.pl pass.
	if (exists($data{$id}->{'revfold'})) {
		$str .= "\t" . $data{$id}->{'revcsRNA'} . "\t" . $data{$id}->{'revcsRNAinput'} . "\t" . $data{$id}->{'revrna'};
		$str .= "\t" . $data{$id}->{'revfold'};
	} else {
		$str .= "\tna\tna\tna\tna";
	}
	$str .= "\n";
	$total++;
	my $inputFold = $data{$id}->{'foldInput'};
	my $rnaFold = $data{$id}->{'foldRNA'};

	if ($inputFold eq 'na' || $inputFold > $inputThresh) {
		$Noverinput++;
		if ($rnaFold eq 'na' || $rnaFold > $rnaThresh) {
			$Ntss++;
			print {$outFh{'tss'}} $str;
		} else {
			$Nrna++;
			print {$outFh{'rna'}} $str;
		}

		# Independent of the RNA filter: split by annotation label.
		if ($data{$id}->{'ann'} eq 'otherExon') {
			$Nexon++;
			print {$outFh{'annexons'}} $str;
		} else {
			$Nann++;
			print {$outFh{'anntss'}} $str;
		}
	} else {
		$Ninput++;
		print {$outFh{'input'}} $str;
	}
}
foreach my $suffix (sort keys %outFh) {
	close($outFh{$suffix})
		or die "!!! Error: could not finish writing $prefix.$suffix.txt: $!\n";
}
# Summarise the run in <prefix>.stats.txt and echo the same text to STDERR.
# The text is assembled once, written with an explicit error check, and then
# printed from memory rather than by reading the file back.
my $statsText = join("",
	"Set\tTotal TSS\n",
	"total\t$total\n",
	"valid TSS\t$Ntss\n",
	"total over input\t$Noverinput\n",
	" under RNA\t$Nrna\n",
	" valid Annotation: $Nann\n",
	" exon Annotation: $Nexon\n",
	"Total input\t$Ninput\n",
	"log2 fold vs. input: $inputThresh\n",
	"log2 fold vs. rna: $rnaThresh\n",
	"Number of TSS regions for foldChange Calculation: $numTSSann\n",
	"Number of exon regions for foldChange Calculation: $numExonann\n",
);
open(my $statsFh, '>', "$prefix.stats.txt")
	or die "!!! Error: could not open $prefix.stats.txt for writing: $!\n";
print $statsFh $statsText;
close($statsFh)
	or die "!!! Error: could not finish writing $prefix.stats.txt: $!\n";
print STDERR $statsText;

# The five result tables, in the order in which they are post-processed.
my @files = ();
foreach my $suffix (qw(tss input rna anntss annexons)) {
	push(@files, "$prefix.$suffix.txt");
}

# With a genome available, run annotatePeaks.pl in -hist mode on every result
# table and store its output next to the table as <table>.freq.tsv; anything
# the command prints on stderr is discarded.
unless ($genome eq '' || $genome eq 'none') {
	for my $f (@files) {
		`annotatePeaks.pl $f $genome -size $freqSize -hist 1 -di > $f.freq.tsv 2> /dev/null`;
	}
}

# Remove the temporary files that are still present: the initial TSS calls
# and, when a GTF file was used, the six GTF-derived peak files.
`rm $initialTSSfile`;
unless ($gtfFile eq '') {
	my $gtfTempFiles = join(' ', $gtfTSSfile, $gtfSingleExonFile, $gtf1stExonFile,
		$gtfOtherExonsFile, $divergentTSSfile, $antisenseTSSfile);
	`rm $gtfTempFiles`;
}
