Functions
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
####################### # Day Four Functions # ####################### # Simple function for splitting a text by a delim split_text <- function(a_text, delim = " "){ unlist(strsplit(a_text, delim)) } select_tagged_words <- function(tagged_words, target_tag){ tagged_words[grep(target_tag, tagged_words)] } remove_tags <- function(word_pos){ gsub("/[A-Z]{2,3}.*", "", word_pos) } # Added this one on request. . . we did not cover this in class select_my_words <- function(tagged_words, a_word_list){ tagged_words <- gsub("/.*", "", tagged_words) tagged_words[which(tagged_words %in% a_word_list)] } |
Main Script
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
############## # Day 4 Code # ############## ############################################## # Call a source file of functions source("code/day_four_functions.R") # Set the input Parameters input_dir <- "data/taggedCorpus" files_v <- dir(input_dir, "\\.txt") pos_tag <- "IN|DT" # An empty container to store some data while corpus_data <- NULL ############################################## # The meat of the script is a for loop for(i in 1:length(files_v)){ path_to_file <- file.path(input_dir, files_v[i]) text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8") parsed_text <- split_text(text_v) selected_word_pos <- select_tagged_words(parsed_text, pos_tag) words <- remove_tags(selected_word_pos) # Deal with possibility of no words of a particular POS using if/else if(length(words) > 1){ lower_words <- tolower(words) clean_words <- gsub("\\W", "", lower_words) clean_words <- clean_words[which(clean_words != "")] word_df <- as.data.frame(table(clean_words)/length(clean_words)) file_word_freq <- cbind(ID=files_v[i], word_df) corpus_data <- rbind(corpus_data, file_word_freq) } else { # Be kind and report the error! cat("file", files_v[i], "has no found words.") } } ############################################## # Now munge and reshape the data # xtabs works like "pivot table" in Excel final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data) # Let's only study the 100 most frequeny features in the data the_means <- colMeans(final_df) max_cols <- 100 if(length(the_means) < max_cols){ max_cols <- length(the_means) } keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols]) cluster_data <- final_df[,keepers] stop() # Calculate distance, cluster, and plot dendrogram. dm <- dist(cluster_data) hc <- hclust(dm) plot(hc, hang=-1,main = pos_tag) ############################################## # Here is a variation that let's you find a # specific list of hand curated words instead # of a POS tagged word. . . ############################################## ############################################## # Call a source file of functions source("code/day_four_functions.R") # Set the input Parameters input_dir <- "data/taggedCorpus" files_v <- dir(input_dir, "\\.txt") my_words <- c("the","of","and","it","a") # An empty container to store some data while # looping. . . . corpus_data <- NULL ############################################## # The meat of the script is a for loop for(i in 1:length(files_v)){ path_to_file <- file.path(input_dir, files_v[i]) text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8") parsed_text <- split_text(text_v) words <- select_my_words(parsed_text, my_words) # Deal with possibility of no words of a particular POS using if/else if(length(words) > 1){ lower_words <- tolower(words) clean_words <- gsub("\\W", "", lower_words) clean_words <- clean_words[which(clean_words != "")] word_df <- as.data.frame(table(clean_words)/length(clean_words)) file_word_freq <- cbind(ID=files_v[i], word_df) corpus_data <- rbind(corpus_data, file_word_freq) } else { # Be kind and report the error! cat("file", files_v[i], "has no found words.") } } ############################################## # Now munge and reshape the data # xtabs works like "pivot table" in Excel final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data) # Let's only study the 100 most frequeny features in the data the_means <- colMeans(final_df) max_cols <- length(my_words) if(length(the_means) < max_cols){ max_cols <- length(the_means) } keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols]) cluster_data <- final_df[,keepers] stop() # Calculate distance, cluster, and plot dendrogram. dm <- dist(cluster_data) hc <- hclust(dm) plot(hc, hang=-1, main = paste(my_words, collapse = " ")) |