source("code/day_five_functions.R")
library(e1071)
# Set the input parameters
input_dir <- "data/taggedCorpus"
files_v <- dir(input_dir, "\\.txt$") # anchor the regex so only files ending in .txt match
# We could enter a custom list...
# my_words <- c("the", "of", "and", "it", "a")
# ...or we can load a list from a file
my_words <- read.csv("data/stoplist.csv", row.names = NULL, stringsAsFactors = FALSE)[,1]
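# A sketch of the expected file layout (hypothetical contents): the
# stoplist is assumed to have a header row and one stop word per line
# in its first column, e.g.
#   word
#   the
#   of
#   and
# read.csv(...)[, 1] then returns that column as a character vector.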
# An empty container to store some data while looping...
corpus_data <- NULL
##############################################
# The meat of the script is a for loop
for (i in seq_along(files_v)) {
  path_to_file <- file.path(input_dir, files_v[i])
  text_v <- scan(path_to_file, what = "character", sep = "\n")
  parsed_text <- split_text(text_v)
  words <- select_my_words(parsed_text, my_words)
  # Skip files in which we found no (or only one) target word
  if (length(words) > 1) {
    clean_words <- gsub("[^[:alnum:][:space:]']", "", words)
    clean_words <- clean_words[which(clean_words != "")]
    # Convert raw counts to relative frequencies within the file
    word_df <- as.data.frame(table(clean_words) / length(clean_words))
    file_word_freq <- cbind(ID = files_v[i], word_df)
    corpus_data <- rbind(corpus_data, file_word_freq)
  } else {
    # Be kind and report the problem!
    cat("file", files_v[i], "has no found words.\n")
  }
}
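# corpus_data now holds one "long" row per (file, word) pair, e.g.
# (hypothetical file names and values, for illustration only):
#   ID            clean_words  Freq
#   anon_one.txt  the          0.0612
#   auth_a.txt    the          0.0548
#   auth_a.txt    of           0.0231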
##############################################
# Now munge and reshape the data
# xtabs works like a "pivot table" in Excel: it reshapes the long
# (ID, word, Freq) rows into a file-by-word matrix
final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data)
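# A toy illustration of the reshape (hypothetical mini data frame):
#   d <- data.frame(ID   = c("a.txt", "a.txt", "b.txt"),
#                   word = c("the", "of", "the"),
#                   Freq = c(0.5, 0.25, 0.75))
#   xtabs(Freq ~ ID + word, data = d)
# yields a 2 x 2 matrix with files as rows and words as columns;
# missing (file, word) pairs become 0.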
# Let's study only the most frequent features in the data
the_means <- colMeans(final_df)
num_features <- 10 # number of frequent words (features) to keep
# Make sure we don't pick too many
if (length(the_means) < num_features) {
  num_features <- length(the_means)
}
keepers <- names(sort(the_means, decreasing = TRUE)[1:num_features])
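# The line above keeps the names of the num_features most frequent
# words. A toy illustration (hypothetical values): if
#   the_means <- c(the = 0.06, of = 0.03, and = 0.02)
# then names(sort(the_means, decreasing = TRUE)[1:2]) is c("the", "of").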
class_m <- final_df[,keepers]
# File names are assumed to begin with a four-character author ID,
# with "anon" marking the text(s) of unknown authorship
author_ids <- substr(rownames(class_m), 1, 4)
class_df <- data.frame(author_ids, class_m, stringsAsFactors = FALSE)
# Hold out the unknown-author row(s) for testing; train on the rest
test_row <- which(class_df$author_ids == "anon")
feature_cols <- 2:ncol(class_df)
train <- class_df[-test_row, feature_cols]
class_col <- class_df[-test_row, "author_ids"]
test_data <- class_df[test_row, feature_cols]
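##############################################
# The e1071 package loaded at the top supplies an SVM classifier.
# A minimal sketch of how these objects would feed it (a plausible
# next step, not necessarily this script's exact call):
model <- svm(x = train, y = as.factor(class_col))
prediction <- predict(model, test_data)
prediction # predicted author ID(s) for the "anon" file(s)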