source("http://www.uvm.edu/~rsingle/stat3880/data/scripts-3880.R") #Read in data: 8 classified messages & 2 test messages #txt_raw <- read.csv("small_spam_data.csv", stringsAsFactors = FALSE) txt_raw <- classdata("small_spam_data.csv", sep=",") txt_raw type = txt_raw$type #a short name for later #install.packages("tm") #install the Text Mining package library(tm) txt_corpus <- Corpus(VectorSource(txt_raw$text)) inspect(txt_corpus[1:5]) #Simplify the corpus for NLP corpus_clean <- tm_map(txt_corpus, tolower) corpus_clean <- tm_map(corpus_clean, removeNumbers) stopwords() corpus_clean <- tm_map(corpus_clean, removeWords, stopwords()) corpus_clean <- tm_map(corpus_clean, removePunctuation) corpus_clean <- tm_map(corpus_clean, stripWhitespace) inspect(txt_corpus[1:5]) inspect(corpus_clean[1:5]) #Tokenize: split the messages into words via DocumentTermMatrix() "DTM" # --> sparse matrix One row for each message & one column for each word txt_dtm <- DocumentTermMatrix(corpus_clean) dim(txt_dtm) txt_dtm[1:6,] inspect(txt_dtm[1:6,]) #identify frequent words findFreqTerms(txt_dtm, 1) #words that appear at least once findFreqTerms(txt_dtm, 3) #words that appear at least three times txt_dict <- findFreqTerms(txt_dtm, 2) txt_dict #restrict DTM to a subset of frequent words txt_dtm2 <- DocumentTermMatrix(corpus_clean, list(dictionary = txt_dict)) inspect(txt_dtm2[1:6,]) #convert DTM to matrix with ONLY 0/1 entries (Note: see 1st row) txt_mat <- apply(txt_dtm2, MARGIN=2, function(x){( ifelse(x > 0, 1, 0) )}) inspect(txt_mat[1:6,]) #ERROR: inspect() is only for DTMs txt_mat[1:6,] #NOTE: cols no longer alphabetical attributes(corpus_clean) cbind(corpus_clean$content,type) word.no=1 ( word = txt_dict[word.no] ) ( tab = table(txt_raw$type, txt_mat[,word]) ) freq.ham1 = .01 + sum(txt_mat[,word][type=="ham" ]) / 9 #NB: there are 9 words in HAM training msgs (cleaned) freq.ham1 #create vectors of counts for each word total.words = ncol(txt_mat) #or dim(txt_mat)[2] count.ham = NULL #initialize vectors count.spam = NULL for (i in 1:total.words) { count.ham[i] = sum(txt_mat[,i][type=="ham"]) count.spam[i] = sum(txt_mat[,i][type=="spam"]) } names(count.ham) = colnames(txt_mat) names(count.spam) = colnames(txt_mat) count.ham count.spam cbind(count.ham,count.spam) apply(cbind(count.ham,count.spam),2,sum) #inspect message 5 message = 5 inspect(txt_corpus[message]) inspect(corpus_clean[message]) txt_mat[message,] txt_mat[message,]==1 count.vec.ham[txt_mat[message,]==1] count.vec.spam[txt_mat[message,]==1]