source("http://www.uvm.edu/~rsingle/stat3880/data/scripts-3880.R")

#Read in data: 8 classified messages & 2 test messages
#txt_raw <- read.csv("small_spam_data.csv", stringsAsFactors = FALSE)
 txt_raw <- classdata("small_spam_data.csv", sep=",")
 txt_raw
 type = txt_raw$type #a short name for later

#install.packages("tm") #install the Text Mining package
 library(tm)
 txt_corpus <- Corpus(VectorSource(txt_raw$text))
 inspect(txt_corpus[1:5])

#Simplify the corpus for NLP
 corpus_clean <- tm_map(txt_corpus, tolower)
 corpus_clean <- tm_map(corpus_clean, removeNumbers)
 stopwords()
 corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
 corpus_clean <- tm_map(corpus_clean, removePunctuation)
 corpus_clean <- tm_map(corpus_clean, stripWhitespace)

 inspect(txt_corpus[1:5])
 inspect(corpus_clean[1:5])

#Tokenize: split the messages into words via DocumentTermMatrix() "DTM"
# --> sparse matrix One row for each message & one column for each word 
 txt_dtm <- DocumentTermMatrix(corpus_clean)
 dim(txt_dtm)
 txt_dtm[1:6,]
 inspect(txt_dtm[1:6,])

#identify frequent words
 findFreqTerms(txt_dtm, 1) #words that appear at least once
 findFreqTerms(txt_dtm, 3) #words that appear at least three times
 txt_dict <- findFreqTerms(txt_dtm, 2) 
 txt_dict

#restrict DTM to a subset of frequent words 
 txt_dtm2 <- DocumentTermMatrix(corpus_clean, list(dictionary = txt_dict))
 inspect(txt_dtm2[1:6,]) 

#convert DTM to matrix with ONLY 0/1 entries (Note: see 1st row)
 txt_mat <- apply(txt_dtm2, MARGIN=2, function(x){( ifelse(x > 0, 1, 0) )})
 inspect(txt_mat[1:6,]) #ERROR: inspect() is only for DTMs
 txt_mat[1:6,]          #NOTE:  cols no longer alphabetical

 attributes(corpus_clean)
 cbind(corpus_clean$content,type)

 word.no=1
 ( word = txt_dict[word.no] )
 ( tab = table(txt_raw$type, txt_mat[,word]) )
 freq.ham1  = .01 + sum(txt_mat[,word][type=="ham" ]) / 9  #NB: there are 9 words in HAM training msgs (cleaned)
 freq.ham1 

#create vectors of counts for each word 
 total.words = ncol(txt_mat) #or dim(txt_mat)[2]
 count.ham = NULL        #initialize vectors
 count.spam = NULL       
 for (i in 1:total.words) {
   count.ham[i]  = sum(txt_mat[,i][type=="ham"])
   count.spam[i] = sum(txt_mat[,i][type=="spam"])
 } 
 names(count.ham) = colnames(txt_mat)
 names(count.spam) = colnames(txt_mat)
 count.ham
 count.spam
 cbind(count.ham,count.spam)
 apply(cbind(count.ham,count.spam),2,sum)

#inspect message 5
 message = 5
 inspect(txt_corpus[message])
 inspect(corpus_clean[message])
 txt_mat[message,]
 txt_mat[message,]==1
 count.vec.ham[txt_mat[message,]==1]
 count.vec.spam[txt_mat[message,]==1]