#!/usr/bin/env perl                                                                                                                  
use strict;
use warnings;
use Cwd 'abs_path';
use Encode;
use utf8;
binmode STDIN,":utf8";
binmode STDOUT,":utf8";
no warnings 'utf8';


### randomPartitionText.pl
### - Jake Ryland Williams
#
### general description: This script computes random partition expected counts for phrases in
###                       multiple ways. These are in general controlled by alphaSplit.pl, as there
###                       is some delicacy in determining how best to manage memory. Please read
###                       the documentation in alphaSplit.pl for more details.
###
#

### set the defaults (the key=value settings can be overridden on the command line)
my $string   = "";          # phrase prefix that restricts which phrases are counted (string=...)
my $file     = 0;           # path of the text to read (file=...); 0 means none was given
my $maxRank  = 10000000;    # number of distinct phrases above which alphaSplit.pl is run (ranks=...)
my $maxOrder = 50;          # longest phrase, in tokens, that is counted (max=...)
my $q        = 0.5;         # the q used in the phrase weights (q=...)

### switches controlled by the single-letter flags
my ($subword, $wrapping, $verbose)  = (0, 0, 0);     # -s, -w and -v switch these on
my ($noRT, $noBot, $noMark, $noUrl) = (1, 1, 1, 1);  # -r, -b, -m and -u switch these off

### working variables
my $real = 1;               # set per line; 0 when the line matched a known automation
my $input;                  # loop variable for the command line arguments
my $flag;                   # loop variable for the flag letters
my @flags;                  # the individual flag letters that were given

### check the user input
###
### Recognized arguments are key=value pairs (string=, ranks=, file=, max=, q=;
### the keys are case-insensitive) and bundles of lowercase flag letters such
### as -sw. Anything else is ignored. The value of string= is lowercased.
foreach my $argument (@ARGV){
    if ($argument =~ m/^string\=(.*?)$/i){
        $string = lc($1);
    }
    elsif ($argument =~ m/^ranks\=(.*?)$/i){
        $maxRank = $1;
    }
    elsif ($argument =~ m/^file\=(.*?)$/i){
        $file = $1;
    }
    elsif ($argument =~ m/^max\=(.*?)$/i){
        $maxOrder = $1;
    }
    elsif ($argument =~ m/^q\=(.*?)$/i){
        $q = $1;
    }
    ### The flag pattern is anchored at both ends: an argument is a flag bundle
    ### only when it consists of nothing but a dash and letters. Without the
    ### anchors a hyphen inside a value (file=/data/my-text.txt,
    ### string=well-known) was read as a bundle of flags and replaced the real
    ### ones. Letters from several bundles (-s -w) are accumulated.
    elsif ($argument =~ m/^\-([a-z]+)$/){
        push(@flags, split("",$1));
    }
}

### check the file: a path has to be given and has to be openable for reading
if (!$file){
    die "please specify a file path.\n\n"."try something like: file=/path/to/my/text.txt\n";
}
if (!open(INFILE,"<",$file)){
    die "your file path was invalid: ".$file."\n";
}
close(INFILE);

### go through the flags
### every recognized letter names the switch it controls and the value it
### assigns to that switch; letters that are not listed are ignored
my %switchForFlag = (
    "s" => [\$subword,  1],
    "w" => [\$wrapping, 1],
    "r" => [\$noRT,     0],
    "b" => [\$noBot,    0],
    "m" => [\$noMark,   0],
    "u" => [\$noUrl,    0],
    "v" => [\$verbose,  1],
);
foreach my $letter (@flags){
    next if (!exists($switchForFlag{$letter}));
    my ($switch, $value) = @{$switchForFlag{$letter}};
    ${$switch} = $value;
}

### set the subword/word space
### in subword mode the token separator is the empty string and line wrapping
### is switched off; otherwise tokens are separated by a single space
my $space = $subword ? "" : " ";
if ($subword){
    $wrapping = 0;
}

### initialize the paths
### $root is the absolute path of this script with the script name removed
my $root = abs_path($0);
$root =~ s/randomPartitionText\.pl//;
my $outfile = $file;

### set up for alphaSplit.pl if we need it
### the pieces are joined with single spaces into one command line; the flag
### bundle is only included when flags were given
my @alphaSplitParts = ("perl", $root."alphaSplit.pl");
if (scalar(@flags)){
    push(@alphaSplitParts, "\-".join("",@flags));
}
push(@alphaSplitParts,
     "max\=$maxOrder",
     "q\=$q",
     "depth\=1",
     "\"string\=$string\"",
     "ranks\=$maxRank",
     "file\=$file");
my $alphaSplit = join(" ", @alphaSplitParts);

### initialize stuff

### text currently being read and cut up
my $line;
my $clause = "";
my @clauses;
my $superClause;
my $word;
my @words;
my @terms;
my $term;

### the accumulated phrase weights, keyed by phrase
my %counts;

### scratch variables of the counting
my $phrase;
my $tempPhrase;
my $order;
my $length;
my $n;
my $i;
my $bnd = 0;
my $stringlength = length($string);
my $phraselength;
my $numKeys;
my $lastLength;

### state of the main loop
my $nextDepth = 0;      # becomes 1 when more than $maxRank phrases were found
my $unprocessed = 0;    # 1 while $clause holds text that has not been counted

### the tag used in the output file name: the prefix with spaces turned into
### dashes, or GLOBAL when there is no prefix
my $outstring = $string;
$outstring =~ s/ /\-/g;
if ($outstring eq ""){
    $outstring = "GLOBAL";
}

### name the output file after the input: foo.txt becomes foo_TAG_counts.txt.
### The extension is matched literally and only at the very end of the path;
### an unescaped, unanchored ".txt" also hit things like "mytxt" in a
### directory name. A path without a .txt extension gets the suffix appended,
### because leaving $outfile equal to $file would make the results overwrite
### the input text.
### NOTE(review): if alphaSplit.pl derives these file names itself it should
### follow the same rule -- confirm there.
unless ($outfile =~ s/\.txt\z/_${outstring}_counts.txt/){
    $outfile .= "_".$outstring."_counts.txt";
}


### subroutine adds to counts for phrases in the clause
###
### Takes no arguments and works on the file-level variables:
###   reads   $clause, $subword, $space, $maxOrder, $string and $q
###   writes  %counts      (a weight is added for every counted phrase)
###           $clause      (reset to "")
###           $unprocessed (reset to 0)
###
### The clause is split into tokens on $space. Every run of 1 .. $maxOrder
### consecutive tokens is a candidate phrase and contributes
###       q^(2-b) * (1-q)^(order-1)
### where order is the number of tokens in the phrase and b is 2 when the
### phrase is the whole clause, 1 when it starts or ends at a clause edge and
### 0 otherwise.
###
### Only phrases that begin with $string are counted. A phrase shorter than
### $string is first padded with "a" up to the length of $string (presumably
### so that every short phrase falls under exactly one prefix when
### alphaSplit.pl splits the work -- confirm there). The prefix is compared as
### plain text; it used to be interpolated into a pattern, which let
### characters such as "." or "(" in it act as regex syntax.
sub countPhrases{
    if (!$subword){
        $clause =~ s/^\s+//;
        $clause =~ s/\s+$//;
        $clause =~ s/\s+/ /g;
    }
    $clause =~ s/\n//g;

    my @tokens = split($space,$clause);
    my $numTokens = scalar(@tokens);
    my $longest = ($numTokens > $maxOrder) ? $maxOrder : $numTokens;
    my $prefixLength = length($string);

    foreach my $size (1 .. $longest){
        my $lastStart = $numTokens - $size;
        foreach my $start (0 .. $lastStart){
            my $candidate = join($space, @tokens[$start .. $start + $size - 1]);

            ### pad short candidates so they can be compared with the prefix
            my $padded = $candidate;
            my $shortBy = $prefixLength - length($candidate);
            if ($shortBy > 0){
                $padded .= "a" x $shortBy;
            }
            next if (substr($padded, 0, $prefixLength) ne $string);

            ### how many clause edges does this phrase touch?
            my $boundaries = 0;
            if ($size == $numTokens){
                $boundaries = 2;
            }
            elsif ($start == 0 || $start == $lastStart){
                $boundaries = 1;
            }
            $counts{$candidate} += ($q ** (2 - $boundaries)) * ((1 - $q) ** ($size - 1));
        }
    }

    $clause = "";
    $unprocessed = 0;
    return;
}

### print out dictionary progress (only when the v flag was given)
print "building a section under \"".$string."\"\n" if ($verbose);

### go through the text line by line
###
### Every line is cleaned up according to the switches, cut into clauses (runs
### of words, optionally starting with @ or # and with inner ' or -), and the
### clauses are handed to countPhrases. Reading stops early, with $nextDepth
### set, once more than $maxRank distinct phrases have been collected.

### known automations; a line containing any of these (in any case) is skipped
my @botMarkers = (
    (map { "more for ".$_ } qw(aries taurus gemini cancer leo virgo libra
                               scorpio sagittarius capricorn aquarius pisces)),
    '4sq.com',
    'extra watering cans after harvesting',
    '@questionnnierr',
);
my $botAlternation = join("|", map { quotemeta($_) } @botMarkers);
my $botPattern = qr/$botAlternation/i;

### the path was probed earlier, but it can still fail here, so check again
open(my $textHandle,"<",$file) or die "could not open ".$file." for reading: ".$!."\n";
while ($line = <$textHandle>){
    chomp $line;
    $real = 1;
    if ($noBot && $line =~ $botPattern){       ### ignores lines with known automations.
        $real = 0;
    }
    if ($real){                                ### if we didn't determine this line to be "from a bot"
        if ($noMark){                          ### interprets the common markup
            $line =~ s/\&lt/\</g;
            $line =~ s/\&gt/\>/g;
            $line =~ s/\&amp/\&/g;
            $line =~ s/\\n/\. /g;
        }
        if ($noUrl){                           ### map urls onto the word "http"
            $line =~ s/http[^ \n]+/http/g;
        }
        if ($noRT){                            ### remove automated RT structure
            $line =~ s/RT \@[^ ]+\: /\. /g;
        }

        if (($line eq "") && $wrapping){
            ### an empty line ends a clause that was carried over
            countPhrases() if ($unprocessed);
        }
        else{
            while ($line =~ m/(((\@|\#)?[a-z]+((\'|\-)[a-z]+)*\'?\s?)+)/gi){
                my $found = lc($1);
                if ($wrapping){
                    $clause = $clause." ".$found;
                    if (length($line) == $+[0]){
                        ### the match runs to the end of the line, so the
                        ### clause may continue on the next line
                        $unprocessed = 1;
                    }
                    else{
                        countPhrases();
                    }
                }
                elsif ($subword){
                    ### in subword mode every word is a clause of its own
                    foreach my $piece (split(" ",$found)){
                        $clause = $piece;
                        $unprocessed = 1;
                        countPhrases();
                    }
                }
                else{
                    $clause = $found;
                    $unprocessed = 1;
                    countPhrases();
                }
            }
        }
    }
    ### if we find too many phrases (N > $maxRank) then break the loop
    ### and run alphaSplit.pl
    $numKeys = keys %counts;
    if ($numKeys > $maxRank){
        $nextDepth = 1;
        last;
    }
}
close($textHandle);

### with line wrapping on, a clause may still be waiting after the last line
if ($wrapping && $unprocessed){
    countPhrases();
}

if ($nextDepth){ ### if there were too many phrases run alphaSplit.pl
    ### The arguments are passed as a list, so no shell is involved: spaces or
    ### shell metacharacters in the file path or in the prefix reach
    ### alphaSplit.pl unchanged. (The shell command line in $alphaSplit left
    ### the file= value unquoted.)
    my @splitCommand = ("perl", $root."alphaSplit.pl");
    if (scalar(@flags)){
        push(@splitCommand, "-".join("",@flags));
    }
    push(@splitCommand,
         "max=".$maxOrder,
         "q=".$q,
         "depth=1",
         "string=".$string,
         "ranks=".$maxRank,
         "file=".$file);
    if (system(@splitCommand) != 0){
        warn "alphaSplit.pl did not finish cleanly (status ".$?.")\n";
    }
}
else{ ### otherwise print the (non-zero) counts out
    if (scalar(keys %counts)){
        open(my $outHandle,">",$outfile) or die "could not open ".$outfile." for writing: ".$!."\n";
        ### largest count first; equal counts are ordered by phrase so that
        ### the output does not depend on the hash order
        foreach my $counted (sort {$counts{$b} <=> $counts{$a} or $a cmp $b} keys %counts){
            next if (!$counts{$counted});
            print $outHandle "\"".$counted."\"\t".$counts{$counted}."\n";
        }
        ### write errors on a buffered handle only show up when it is closed
        close($outHandle) or die "could not finish writing ".$outfile.": ".$!."\n";
    }
}
