1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# Build a table of relative word frequencies (one row per file, one column
# per word) for every text file in the corpus directory.
# NOTE(review): split_text() and select_my_words() are presumably defined in
# the sourced helper file below -- confirm.
source("code/day_five_functions.R")
library(e1071)

# Set the input parameters ----
input_dir <- "data/taggedCorpus"
# The pattern is anchored so that only names ending in ".txt" are matched.
files_v <- dir(input_dir, pattern = "\\.txt$")

# We could enter a custom list...
# my_words <- c("the", "of", "and", "it", "a")
# ...or we can load a list from a file (first column of the CSV).
my_words <- read.csv(
  "data/stoplist.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)[, 1]

##############################################
# The meat of the script is a for loop ----
# Per-file results go into a preallocated list and are bound together once
# after the loop, rather than growing a data frame with rbind() each pass.
per_file_freqs <- vector("list", length(files_v))

# seq_along() yields an empty sequence when no files were found, whereas
# 1:length(files_v) would iterate over 1 and 0.
for (i in seq_along(files_v)) {
  path_to_file <- file.path(input_dir, files_v[i])
  text_v <- scan(path_to_file, what = "character", sep = "\n")
  parsed_text <- split_text(text_v)
  words <- select_my_words(parsed_text, my_words)

  # Keep only letters, digits, whitespace and apostrophes, then drop any
  # token that ended up empty.
  clean_words <- gsub("[^[:alnum:][:space:]']", "", words)
  clean_words <- clean_words[which(clean_words != "")]

  # Deal with the possibility of no words. The check is made after cleaning
  # because a file whose tokens all clean to "" would otherwise reach cbind()
  # with a zero-row frequency table and fail there.
  if (length(clean_words) > 0) {
    # Relative frequency of each word within this file. The variable name
    # clean_words becomes the column name that xtabs() refers to below.
    word_df <- as.data.frame(table(clean_words) / length(clean_words))
    per_file_freqs[[i]] <- cbind(ID = files_v[i], word_df)
  } else {
    # Be kind and report the problem, one file per output line.
    cat("file", files_v[i], "has no found words.\n")
  }
}

# Skipped files left NULL slots; drop them and stack the rest in file order.
corpus_data <- do.call(rbind, Filter(Negate(is.null), per_file_freqs))
if (is.null(corpus_data)) {
  stop("No words were found in any file under ", input_dir, call. = FALSE)
}

##############################################
# Now munge and reshape the data ----
# xtabs works like a "pivot table" in Excel: rows are file IDs, columns are
# words, cells are the relative frequencies.
final_df <- xtabs(Freq ~ ID + clean_words, data = corpus_data)

# Let's only study the most frequent features in the data; num_features sets
# how many of them to keep.
the_means <- colMeans(final_df)
num_features <- 10 # Number of frequent words to use?
# Select the most frequent words as features and split the rows into a
# training set and the rows whose ID starts with "anon".

# Make sure we don't pick too many
num_features <- min(num_features, length(the_means))

# Names of the num_features columns with the highest mean frequency.
# seq_len() stays empty when there are no features (1:0 would not).
keepers <- names(sort(the_means, decreasing = TRUE))[seq_len(num_features)]

# drop = FALSE keeps a two-dimensional result even for a single feature.
# unclass() strips the table class so that data.frame() below builds one
# column per word instead of reshaping the table to long format.
class_m <- unclass(final_df[, keepers, drop = FALSE])

# The first four characters of each row name (the file name) are used as
# the author label.
author_ids <- substr(rownames(class_m), 1, 4)
class_df <- data.frame(author_ids, class_m, stringsAsFactors = FALSE)

# Rows labelled "anon" are held out as test rows. A logical mask is used for
# the training subset because class_df[-test_row, ] selects zero rows when
# test_row is empty.
is_test <- class_df$author_ids == "anon"
test_row <- which(is_test)

# Every column except the first (author_ids) is a feature.
feature_cols <- seq_len(ncol(class_df))[-1]

train <- class_df[!is_test, feature_cols, drop = FALSE]
class_col <- class_df[!is_test, "author_ids"]
test_data <- class_df[test_row, feature_cols, drop = FALSE]