#!/usr/bin/env perl                                                                                                                  
use strict;
use warnings;
use Cwd 'abs_path';
use Encode;
use utf8;
binmode STDIN,":utf8";
binmode STDOUT,":utf8";
no warnings 'utf8';


### alphaSplit.pl
### - Jake Ryland Williams
#
### general description:
###
### This script is intended to aid in the partitioning of very large text files containing
### unaccented latin utf8 text. Please note that it comes with companion scripts 
### randomPartitionText.pl and collectCounts.pl. In order to create complete rank-ordered
### lists of phrases, the user will have to run collectCounts.pl to merge the dictionary
### files (which are needed for memory limits) once alphaSplit.pl has run. 
###
### Random partitions can generate count distributions with many ranks, and so this
### script will split the job of partitioning according to an alphabetic dictionary
### ordering. Hence the output of this script will be multiple count files, which contain
### the counts of the phrases that begin with a designated start string. As an example,
### the phrase 
###                                 "it was the best of times" 
### may wind up in the file
###                                "twoCities_it-w_counts.txt"
###
### where the "-" indicates a space.
###
### NOTE: "collectCounts.pl" will throw an error depending on the users terminal "ulimit"
###       settings. In this event the standard output should indicate an appropriate ulimit
###       setting. Be aware that setting the ulimit may require starting a new terminal 
###       session.
###
### NOTE: the documentation of flags and inputs for randomPartitionText.pl is covered
###       in this file (it has the same variables as alphaSplit.pl)
###
#
##############################################################################################
#
################### sample call: alphaSplit.pl
#
### The following example will partition "A Tale of Two Cities" starting at depth 1 (letters).
### Note that line wrapping is on (-w), we will only print out to 100-grams (max=100), and
### will have files split (alphabetically) when they exceed 60000 ranks (ranks=60000).
#
#
# partitioning a text:
#
# cd path/to/partitioner
# perl bin/alphaSplit.pl -vw depth=1 file=data/twoCities.txt max=100 ranks=60000
#
###################
#
#
################### sample call: collectCounts.pl
#
### In order to piece the files back together into a single file of counts we will call the 
### collectCounts.pl script in the package. Note that this script will open ALL of the count
### files from the partitioning at once, and so if it detects that your ulimit (the max number
### of files your computer will allow you to open at once) is too low, collectCounts.pl will
### die and tell you what to set it to. Please also be aware that on Unix machines (at least)
### you will have to start a new terminal to change the ulimit, if you've already changed it.
#
#
# piece the count files together: 
#
# cd path/to/partitioner
# perl bin/collectCounts.pl "data/twoCities"
#
###################
#
#
################################# flags ############################
#
#               -w     # turn on line wrapping
#               -s     # switch to sub-word partitioning
#               -v     # be verbose about dictionary progress
#               -u     # leave in urls
#               -b     # leave in known twitter automations
#               -m     # leave in common markup: &gt, &lt ...
#               -r     # leave in retweet automations (RT @...: )
#
####################################################################
#
#
################################# input ############################
# 
## file ## the file of raw text: 
#
#                        file=path/to/file.txt
##########
#
## ranks ## the maximum rank (size) for dictionary sections:
#
#                        ranks=10000000     (no more than 10 million ranks per section)         
#
#           Note: If you set "ranks" too high and run a large partition, then you will very likely
#                 run out of memory (not fun). 
#
#                 So, a very important rule of thumb is: 
#
#                              5000000 ranks of phrases consumes 1Gb of available memory
###########
#
## max ## the longest-length phrases to partition:
#
#                        max=50     (up to and including 50-grams)
#########
#
## string ## the string from which we start descending the dictionary
#
#                        string=""  (global)
############
#
## depth ## the depth from the string at which we cover the tree
#
#                        depth=0    (global)
###########
#
## q ## set the "temperature" of the partition:
#
#                        q=0.5      (all partitions equally likely)
#######


### default settings; each one can be overridden from the command line

### where in the dictionary to work
my $startString = "";            ### start from all strings
my $depth       = 0;             ### global depth

### limits handed on to the partitioner
my $maxRank  = 10_000_000;       ### 10 million ranks ~ 2Gb memory
my $maxOrder = 50;               ### maximum of 50-grams
my $q        = 0.5;              ### q=0.5 "temperature"

### input file; stays 0 (false) so the script dies if no file is given
my $file = 0;

### switches controlled by the single-letter flags
my $wrapping = 0;                ### no line wrapping
my $subword  = 0;                ### super-word (phrase) partitions
my $verbose  = 0;                ### quiet
my $noRT     = 1;                ### remove "RT @...: " annotations
my $noUrl    = 1;                ### replace Urls with "http"
my $noBot    = 1;                ### ignore known twitter automations
my $noMark   = 1;                ### interpret common markup

### scratch variables for reading the command line
my ($input,$flag);
my @flags;

### read in command line input
###
### key=value arguments are matched against the whole argument, so a value
### that happens to contain another key (e.g. file=data/max=3.txt) cannot
### overwrite a different setting. keys are case-insensitive and may carry
### leading dashes (-file=..., --file=...); unknown keys are ignored.
###
### every "-letters" argument adds its letters to @flags, so "-v -w" and
### "-vw" are equivalent (a later flag group no longer replaces an earlier one).
my %storeValue = (
    "string" => sub { $startString = lc($_[0]); },   ### sections are lower case
    "file"   => sub { $file        = $_[0]; },
    "depth"  => sub { $depth       = $_[0]; },
    "ranks"  => sub { $maxRank     = $_[0]; },
    "max"    => sub { $maxOrder    = $_[0]; },
    "q"      => sub { $q           = $_[0]; },
);

foreach $input (@ARGV){
    if ($input =~ m/^\-*([a-z]+)\=(.*)$/i){
        my ($key,$value) = (lc($1),$2);
        if (exists($storeValue{$key})){
            $storeValue{$key}->($value);
        }
    }
    elsif ($input =~ m/^\-([a-z]+)$/i){
        push(@flags,split("",$1));
    }
}

### check the file: a path must have been given, and it must be openable
### for reading; otherwise stop before any partitioning is attempted
unless ($file){
    die "please specify a file path.\n\n"."try something like: file=/path/to/my/text.txt\n";
}
open(my $probe,"<",$file) or die "your file path was invalid: ".$file."\n";
close($probe);

### go through flags: each known letter moves one switch away from its
### default; letters that are not in the table are ignored
my %flagSetting = (
    "w" => [\$wrapping, 1],
    "s" => [\$subword,  1],
    "r" => [\$noRT,     0],
    "b" => [\$noBot,    0],
    "m" => [\$noMark,   0],
    "u" => [\$noUrl,    0],
    "v" => [\$verbose,  1],
);

foreach my $letterFlag (@flags){
    next unless (exists($flagSetting{$letterFlag}));
    my ($switch,$value) = @{$flagSetting{$letterFlag}};
    ${$switch} = $value;
}

### various initializations

### $root is the directory part of this script's absolute path, with its
### trailing "/". the last path component is dropped whatever it is called,
### so a renamed copy of the script still locates randomPartitionText.pl
my $root = abs_path($0);
$root =~ s{[^/]*\z}{};
my $partitioner = "perl ".$root."randomPartitionText.pl ";

### the section alphabet: a..z, then " " (space) as the last "letter"
my @letters = ("a".."z"," ");
my $letter;
my $command;
my $newString;
my %nextletter;
my $i;
my $notDone = 1;                 ### cleared by nextString once it has wrapped all the way around
my $string = "a" x $depth;       ### first section name: $depth copies of "a"
my $prevString = "";
my $flipped = 0;

### make a letter order that wraps: each letter maps to its successor,
### and the last one (" ") maps back to the first ("a")
foreach $i (0..$#letters){
    $nextletter{$letters[$i]} = $letters[($i+1) % scalar(@letters)];
}

### nextString: advance $string to the next section name in our dictionary
### ordering.
###
### $string is stepped like an odometer: the last character is replaced by
### its successor from %nextletter; a character that was " " wraps around
### and the step carries on into the character to its left.
###
### reads:   $string, %nextletter
### writes:  $string   the next section name
###          $flipped  how many positions wrapped during this step
###          $notDone  set to 0 when every position wrapped, i.e. the
###                    ordering has come all the way around
### returns: the new value of $string
###
### the characters are worked on in a private array, so the file-level
### @letters alphabet is not overwritten by calling this sub.
sub nextString{
    my @chars = split("",$string);
    $flipped = 0;
    for my $pos (reverse(0..$#chars)){
        my $current = $chars[$pos];
        $chars[$pos] = $nextletter{$current};
        last if ($current ne " ");           ### no wrap here, so nothing to carry
        $flipped += 1;
    }
    if ($flipped == scalar(@chars)){
        $notDone = 0;
    }
    $string = join("",@chars);
    return $string;
}


### main code ###

### runPartitioner: run randomPartitionText.pl once for one dictionary section.
###
### argument: the section name (start string) handed over as string=...
###
### the command is given to system() as a list, so no shell is involved and
### file names, install paths or section names containing spaces or shell
### metacharacters reach the partitioner unchanged. the flags from our own
### command line are forwarded as a single "-letters" argument.
###
### a partitioner that cannot be started, or that ends with a non-zero
### status, is reported on STDERR; the remaining sections are still run.
sub runPartitioner{
    my ($section) = @_;
    my @call = ("perl",$root."randomPartitionText.pl");
    if (scalar(@flags)){
        push(@call,"-".join("",@flags));
    }
    push(@call,"string=$section","ranks=$maxRank","file=$file","max=$maxOrder","q=$q");
    system(@call);
    if ($? == -1){
        warn "could not start the partitioner for string \"$section\": $!\n";
    }
    elsif ($? != 0){
        warn "the partitioner did not finish cleanly for string \"$section\" (status $?)\n";
    }
    return;
}

if ($depth){                                                 ### if dictionary has sections
    while ($notDone){                                        ### ($notDone == 1) ==> dictionary not totally covered
        $newString = $startString.$string;                   ### the dictionary string we actually use
        if ($newString !~ m/  / && $newString !~ m/^ /){     ### no multi-space or leading-space section-names
            runPartitioner($newString);
        }
        nextString();                                        ### update $string and $notDone
    }
}
else{                                                        ### at depth 0 there is a single section:
    $newString = $startString;                               ### everything under the start string ("" = global)
    runPartitioner($newString);
}
