# Day Five Code

source("code/day_five_functions.R")
library(e1071)
# Set the input parameters
# input_dir: directory that holds the corpus files.
input_dir <- "data/taggedCorpus"
# files_v: bare file names (no directory part) found in input_dir.
# The pattern is anchored with `$` so that only names ENDING in ".txt" are
# picked up; an unanchored "\\.txt" would also match e.g. "notes.txt.bak".
files_v <- dir(input_dir, pattern = "\\.txt$")

# The words of interest can be typed in directly as a literal vector, e.g.
# my_words <- c("the","of","and","it","a")

# ...or loaded from disk. Only the first column of the CSV is kept, as a
# plain vector.
my_words <- read.csv(
  "data/stoplist.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)[[1]]

# One slot per input file. Each slot receives that file's long-format
# frequency table (columns: ID, clean_words, Freq) or stays NULL when the file
# yields nothing. The pieces are stacked once after the loop, because calling
# rbind() inside the loop re-copies the growing data frame on every iteration.
file_freqs_l <- vector("list", length(files_v))

##############################################
# The meat of the script is a for loop
# seq_along() (not 1:length()) so that an empty files_v means zero iterations
# instead of iterating over c(1, 0).
for (i in seq_along(files_v)) {
  path_to_file <- file.path(input_dir, files_v[i])
  # Read the file as a character vector, one element per line
  text_v <- scan(path_to_file, what = "character", sep = "\n")
  # split_text() and select_my_words() come from the sourced functions file
  parsed_text <- split_text(text_v)
  words <- select_my_words(parsed_text, my_words)
  # Strip everything except letters, digits, whitespace and apostrophes, then
  # discard tokens that end up empty (which() also discards NA entries).
  clean_words <- gsub("[^[:alnum:][:space:]']", "", words)
  clean_words <- clean_words[which(clean_words != "")]
  # Deal with possibility of no words. The check runs AFTER cleaning so that
  # a file whose tokens all vanish never reaches the division below, and a
  # file with exactly one usable word is still counted.
  if (length(clean_words) > 0) {
    # Relative frequency of each word within this file
    word_df <- as.data.frame(table(clean_words) / length(clean_words))
    file_freqs_l[[i]] <- cbind(ID = files_v[i], word_df)
  } else {
    # Be kind and report the error! (newline keeps successive reports apart)
    cat("file", files_v[i], "has no found words.\n")
  }
}

# Stack the per-file tables; NULL slots are ignored by rbind(), and the result
# is NULL when no file contributed any rows.
corpus_data <- do.call(rbind, file_freqs_l)

##############################################
# Now munge and reshape the data
# xtabs works like "pivot table" in Excel: one row per file ID, one column per
# word, each cell holding the summed Freq for that pair (0 where a word does
# not occur in a file).
# xtabs() returns an object of class "table". Base R unrolls such objects back
# into long format (ID / word / Freq rows) whenever they pass through
# data.frame() or as.data.frame(), so convert explicitly to an ordinary wide
# data frame here; row names stay the file IDs, column names stay the words.
final_df <- as.data.frame.matrix(
  xtabs(Freq ~ ID + clean_words, data = corpus_data)
)


# Let's only study the most frequent features in the data, ranked by each
# word's mean relative frequency across all files.
the_means <- colMeans(final_df)
num_features <- 10  # Number of frequent words to use?
# Make sure we don't pick too many
num_features <- min(num_features, length(the_means))
# head() copes with num_features == 0, where 1:num_features would yield c(1, 0)
keepers <- names(head(sort(the_means, decreasing = TRUE), num_features))
# drop = FALSE keeps the two-dimensional shape (and the row names) even when
# only a single feature is selected.
class_m <- final_df[, keepers, drop = FALSE]


# The class label of each row is the first four characters of its row name.
author_ids <- substr(rownames(class_m), 1, 4)
class_df <- data.frame(author_ids, class_m, stringsAsFactors = FALSE)

# Rows labelled "anon" are held out as the test set; all others are training
# data. A logical mask is used for the split because negative indexing with an
# empty index (class_df[-integer(0), ]) selects NO rows, which would silently
# empty the training set whenever there is no "anon" row.
is_test <- class_df$author_ids == "anon"
test_row <- which(is_test)
# Every column except the first (author_ids) is a feature; this form yields an
# empty selection rather than counting backwards when there are no features.
feature_cols <- seq_len(ncol(class_df))[-1]
# drop = FALSE keeps these as data frames even with a single feature column
train <- class_df[!is_test, feature_cols, drop = FALSE]
class_col <- class_df[!is_test, "author_ids"]

test_data <- class_df[is_test, feature_cols, drop = FALSE]