Functions
#######################
# Day Four Functions #
#######################
# Simple function for splitting a text by a delim
split_text <- function(a_text, delim = " "){
unlist(strsplit(a_text, delim))
}
select_tagged_words <- function(tagged_words, target_tag){
tagged_words[grep(target_tag, tagged_words)]
}
remove_tags <- function(word_pos){
gsub("/[A-Z]{2,3}.*", "", word_pos)
}
# Added this one on request. . . we did not cover this in class
select_my_words <- function(tagged_words, a_word_list){
tagged_words <- gsub("/.*", "", tagged_words)
tagged_words[which(tagged_words %in% a_word_list)]
}
Main Script
##############
# Day 4 Code #
##############
##############################################
# Call a source file of functions
source("code/day_four_functions.R")
# Set the input Parameters
input_dir <- "data/taggedCorpus"
files_v <- dir(input_dir, "\\.txt")
pos_tag <- "IN|DT"
# An empty container to store some data while
corpus_data <- NULL
##############################################
# The meat of the script is a for loop
for(i in 1:length(files_v)){
path_to_file <- file.path(input_dir, files_v[i])
text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8")
parsed_text <- split_text(text_v)
selected_word_pos <- select_tagged_words(parsed_text, pos_tag)
words <- remove_tags(selected_word_pos)
# Deal with possibility of no words of a particular POS using if/else
if(length(words) > 1){
lower_words <- tolower(words)
clean_words <- gsub("\\W", "", lower_words)
clean_words <- clean_words[which(clean_words != "")]
word_df <- as.data.frame(table(clean_words)/length(clean_words))
file_word_freq <- cbind(ID=files_v[i], word_df)
corpus_data <- rbind(corpus_data, file_word_freq)
} else {
# Be kind and report the error!
cat("file", files_v[i], "has no found words.")
}
}
##############################################
# Now munge and reshape the data
# xtabs works like "pivot table" in Excel
final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data)
# Let's only study the 100 most frequeny features in the data
the_means <- colMeans(final_df)
max_cols <- 100
if(length(the_means) < max_cols){
max_cols <- length(the_means)
}
keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols])
cluster_data <- final_df[,keepers]
stop()
# Calculate distance, cluster, and plot dendrogram.
dm <- dist(cluster_data)
hc <- hclust(dm)
plot(hc, hang=-1,main = pos_tag)
##############################################
# Here is a variation that let's you find a
# specific list of hand curated words instead
# of a POS tagged word. . .
##############################################
##############################################
# Call a source file of functions
source("code/day_four_functions.R")
# Set the input Parameters
input_dir <- "data/taggedCorpus"
files_v <- dir(input_dir, "\\.txt")
my_words <- c("the","of","and","it","a")
# An empty container to store some data while
# looping. . . .
corpus_data <- NULL
##############################################
# The meat of the script is a for loop
for(i in 1:length(files_v)){
path_to_file <- file.path(input_dir, files_v[i])
text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8")
parsed_text <- split_text(text_v)
words <- select_my_words(parsed_text, my_words)
# Deal with possibility of no words of a particular POS using if/else
if(length(words) > 1){
lower_words <- tolower(words)
clean_words <- gsub("\\W", "", lower_words)
clean_words <- clean_words[which(clean_words != "")]
word_df <- as.data.frame(table(clean_words)/length(clean_words))
file_word_freq <- cbind(ID=files_v[i], word_df)
corpus_data <- rbind(corpus_data, file_word_freq)
} else {
# Be kind and report the error!
cat("file", files_v[i], "has no found words.")
}
}
##############################################
# Now munge and reshape the data
# xtabs works like "pivot table" in Excel
final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data)
# Let's only study the 100 most frequeny features in the data
the_means <- colMeans(final_df)
max_cols <- length(my_words)
if(length(the_means) < max_cols){
max_cols <- length(the_means)
}
keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols])
cluster_data <- final_df[,keepers]
stop()
# Calculate distance, cluster, and plot dendrogram.
dm <- dist(cluster_data)
hc <- hclust(dm)
plot(hc, hang=-1, main = paste(my_words, collapse = " "))