#!/usr/bin/env perl                                                                                                                  
use strict;
use warnings;
use Cwd 'abs_path';
use Encode;
use utf8;
binmode STDIN,":utf8";
binmode STDOUT,":utf8";
no warnings 'utf8';
no strict "refs";

### collectCounts.pl
### - Jake Ryland Williams
#
### general description: This is a companion to alphaSplit.pl and randomPartitionText.pl
###                       and is needed afterward to piece together the dictionary count
###                       files.
###
### NOTE: "collectCounts.pl" will die with an error if the user's terminal "ulimit"
###       setting is too low. In this event the error message will suggest an appropriate
###       ulimit setting. Be aware that setting the ulimit may require starting a new
###       terminal session.
###
#
#
#
################### sample call: collectCounts.pl
#
### In order to piece the files back together into a single file of counts we will call the 
### collectCounts.pl script in the package. Note that this script will open ALL of the count
### files from the partitioning at once, and so if it detects that your ulimit (the max number
### of files your computer will allow you to open at once) is too low, collectCounts.pl will
### die and tell you what to set it to. Please also be aware that on Unix machines (at least)
### you will have to start a new terminal to change the ulimit, if you've already changed it.
#
#
# piece the count files together: 
#
# cd path/to/partitioner
# perl bin/collectCounts.pl "data/twoCities"
#
###################


my $text = $ARGV[0]; ### the text file path (without ".txt") whose counts we will collect

### refuse to run without a path: every file name below is derived from it
die "usage: collectCounts.pl path/to/text (without the .txt extension)\n"
    unless defined $text && length $text;

### file names derived from the command-line path
my $file = $text.".txt";
### Build the output name straight from the path. Rewriting $file with an
### unanchored s/.txt/_counts.txt/ would hit the first "<any char>txt" in the
### path (e.g. "data/mytxtfile.txt" -> "data/m_counts.txtfile.txt").
my $outfile = $text."_counts.txt";

### state shared between the main script body and pullphrases()
my @infiles;          # paths of the per-partition count files
my @handles;          # one input handle per entry of @infiles
my @counts;           # count of the pending (not yet written) entry of each file
my @phrases;          # phrase of the pending (not yet written) entry of each file
my %aremax;           # indices of the files whose pending count equals the current max
my @maxphrases;       # phrases gathered by the latest pullphrases() call
my $numdone = 0;      # number of input files that have been exhausted
my $todo = 0;         # number of input files to merge
my $max  = 0;         # count currently being written out
my $newmax = 0;       # next count to write, as determined by pullphrases()
my $ismax = 1;

### general-purpose scratch variables
my @files;
my $list;
my $i;
my $string;
my $infile;
my $ulimit;
my $phrase;
my $fh;
my $line;

### declared but not referenced anywhere else in the visible code
my @letters = ("a".."z");
my $letter;
my %nextletter;
my $notDone = 1;
my $flipped = 0;
my $numfiles;
my $limitchanged = 0;
my $count;

# Read the next "phrase<TAB>count" entry from an input handle.
# Returns (phrase, count), or an empty list at end of file or when the line
# holds no tab. The trailing newline is optional, so a final line without a
# terminating newline is still returned as an entry.
sub _read_entry {
    my ($in) = @_;
    my $raw = <$in>;
    return unless defined $raw;     # test definedness: a bare "0" line is still a line
    chomp $raw;
    return ($1, $2) if $raw =~ m/^(.*?)\t(.*)$/;
    return;
}

# pull all max count phrases from those files that got'em
#
# For every file index in %aremax, the pending phrase is moved into
# @maxphrases and the file is advanced; this repeats while the newly read
# count still equals $max. A file that runs out of entries gets its pending
# slot reset to ("", 0) and is added to $numdone.
#
# Afterwards, if any file is still unfinished, $newmax is set to the largest
# pending count and %aremax to the indices of the files holding it; otherwise
# $newmax is 0 and %aremax is empty.
#
# Takes no arguments and returns nothing useful; it works entirely on the
# file-level variables @handles, @infiles, @phrases, @counts, %aremax,
# @maxphrases, $max, $newmax, $todo and $numdone.
#
# NOTE(review): a count of 0 doubles as the "file exhausted" marker, so this
# presumably relies on every real entry having a count greater than zero and
# on each file listing its counts in non-increasing order -- confirm against
# the scripts that write the count files.
sub pullphrases {
    @maxphrases = ();

    foreach my $idx (keys %aremax) {
        my $in = $handles[$idx];
        while (1) {
            push(@maxphrases, $phrases[$idx]);

            my ($next_phrase, $next_count) = _read_entry($in);
            if (!defined $next_phrase) {
                # nothing (usable) left in this file
                $phrases[$idx] = "";
                $counts[$idx] = 0;
                $numdone += 1;
                last;
            }

            $phrases[$idx] = $next_phrase;
            $counts[$idx] = $next_count;
            last if $counts[$idx] != $max;
        }
    }

    # if we're not done then determine the max count and max count files
    $newmax = 0;
    %aremax = ();
    if ($todo - $numdone) {
        foreach my $idx (0 .. $#infiles) {
            if ($counts[$idx] > $newmax) {
                %aremax = ($idx => 1);
                $newmax = $counts[$idx];
            }
            elsif ($counts[$idx] == $newmax) {
                $aremax{$idx} = 1;
            }
        }
    }
}

# determine the various count files
#
# The directory is scanned directly instead of shelling out to `ls`, and the
# path is quoted with \Q..\E inside the patterns, so paths containing spaces
# or shell/regex metacharacters are taken literally.
my ($dir, $base) = $text =~ m{\A(.*/)?([^/]*)\z}s;
$dir = "" unless defined $dir;
my $scandir = length($dir) ? $dir : ".";
opendir(my $dh, $scandir) or die "Can't read directory '$scandir': $!";
my @names = sort grep { m/\A\Q$base\E_.*_counts\.txt\z/s } readdir($dh);
closedir($dh);

foreach my $name (@names){
    my $path = $dir.$name;

    # a single GLOBAL count file needs no merging: it simply becomes the output
    if ($name =~ m/\A\Q$base\E_GLOBAL_counts\.txt\z/){
        rename($path, $outfile) or die "Can't move '$path' to '$outfile': $!";
        exit 0;
    }

    if ($name =~ m/\A\Q$base\E_[a-z\-]+_counts\.txt\z/){
        push(@infiles, $path);
    }
}
warn "no count files found matching '${text}_*_counts.txt'\n" unless @infiles;

$todo = scalar(@infiles);
@counts = (0) x $todo;
@phrases = ("") x $todo;

# check the ulimit. if it is too low then die and suggest something higher
#
# `ulimit` is normally a shell builtin, and perl only hands a command to the
# shell when it contains shell metacharacters, so the shell is invoked
# explicitly here. If no number comes back (e.g. "unlimited", or the query
# failed) the check is skipped instead of dying with a bogus limit.
$ulimit = `sh -c 'ulimit -n' 2>/dev/null`;
$ulimit = "" unless defined $ulimit;
chomp($ulimit);
if ($ulimit =~ m/\A\d+\z/ && $ulimit < ($todo + 100)){
    my $suggested = $todo + 100;
    die "you're ulimit is too small ($ulimit), try setting: ulimit -n $suggested <ENTER>\n";
}

# open all of the files to combine (one lexical handle per input file, kept
# in @handles at the same index as the path in @infiles)
@handles = ();
foreach my $path (@infiles){
    open(my $in, "<", $path) or die "Can't open '$path': $!";
    push(@handles, $in);
}

# start reading the files: load the first entry of each one and work out the
# largest count together with the set of files currently holding it
foreach my $idx (0 .. $#infiles){
    my $in = $handles[$idx];
    my $first = <$in>;
    chomp($first) if defined $first;    # the trailing newline is optional
    if (defined $first && $first =~ m/^(.*?)\t(.*)$/){
        $phrases[$idx] = $1;
        $counts[$idx] = $2;
        if ($counts[$idx] > $max){
            %aremax = ($idx => 1);
            $max = $counts[$idx];
        }
        elsif ($counts[$idx] == $max){
            $aremax{$idx} = 1;
        }
    }
    else{
        # empty file, or a first line without a tab: nothing to merge from it
        $numdone += 1;
    }
}

# pull and print out current max count phrases until all files are empty
open(my $out, ">", $outfile) or die "Can't open '$outfile': $!";
while ($todo - $numdone > 0){
    pullphrases();
    foreach my $entry (@maxphrases){
        print {$out} $entry."\t".$max."\n";
    }
    $max = $newmax;
}
# buffered write errors only surface at close, so check it before deleting inputs
close($out) or die "Can't close '$outfile': $!";

# close all of the files once we're done
foreach my $in (@handles){
    close($in);
}


# delete all of the string-files now that we're done.
foreach my $path (@infiles){
    unlink($path) or warn "Can't remove '$path': $!\n";
}
