Day Four Code

Functions

#######################
# Day Four Functions  #
#######################

# Simple function for splitting a text by a delim
split_text <- function(a_text, delim = " "){
  unlist(strsplit(a_text, delim))
}

select_tagged_words <- function(tagged_words, target_tag){
  tagged_words[grep(target_tag, tagged_words)]
}

remove_tags <- function(word_pos){
  gsub("/[A-Z]{2,3}.*", "", word_pos)
}

# Added this one on request. . . we did not cover this in class
select_my_words <- function(tagged_words, a_word_list){
  tagged_words <- gsub("/.*", "", tagged_words)
  tagged_words[which(tagged_words %in% a_word_list)]
}

Main Script

##############
# Day 4 Code #
##############

##############################################
# Call a source file of functions
source("code/day_four_functions.R")

# Set the input Parameters
input_dir <- "data/taggedCorpus"
files_v <- dir(input_dir, "\\.txt")
pos_tag <- "IN|DT"
# An empty container to store some data while 
corpus_data <- NULL

##############################################

# The meat of the script is a for loop
for(i in 1:length(files_v)){
  path_to_file <- file.path(input_dir, files_v[i])
  text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8")
  parsed_text <- split_text(text_v)
  selected_word_pos <- select_tagged_words(parsed_text, pos_tag)
  words <- remove_tags(selected_word_pos)
  # Deal with possibility of no words of a particular POS using if/else
  if(length(words) > 1){
    lower_words <- tolower(words)
    clean_words <- gsub("\\W", "", lower_words)
    clean_words <- clean_words[which(clean_words != "")]
    word_df <- as.data.frame(table(clean_words)/length(clean_words))
    file_word_freq <- cbind(ID=files_v[i], word_df)
    corpus_data <- rbind(corpus_data, file_word_freq)
  } else {
    # Be kind and report the error!
    cat("file", files_v[i], "has no found words.")
  }
}

##############################################
# Now munge and reshape the data

# xtabs works like "pivot table" in Excel
final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data)

# Let's only study the 100 most frequeny features in the data
the_means <- colMeans(final_df)
max_cols <- 100
if(length(the_means) < max_cols){
  max_cols <- length(the_means)
}
keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols])

cluster_data <- final_df[,keepers]
stop()
# Calculate distance,  cluster, and plot dendrogram.
dm <- dist(cluster_data)
hc <- hclust(dm)
plot(hc, hang=-1,main = pos_tag)

##############################################
# Here is a variation that let's you find a
# specific list of hand curated words instead
# of a POS tagged word. . . 
##############################################

##############################################
# Call a source file of functions
source("code/day_four_functions.R")

# Set the input Parameters
input_dir <- "data/taggedCorpus"
files_v <- dir(input_dir, "\\.txt")
my_words <- c("the","of","and","it","a")
# An empty container to store some data while 
# looping. . . .
corpus_data <- NULL

##############################################
# The meat of the script is a for loop
for(i in 1:length(files_v)){
  path_to_file <- file.path(input_dir, files_v[i])
  text_v <- scan(path_to_file, what = "character", sep = "\n", fileEncoding="UTF-8")
  parsed_text <- split_text(text_v)
  words <- select_my_words(parsed_text, my_words)
  # Deal with possibility of no words of a particular POS using if/else
  if(length(words) > 1){
    lower_words <- tolower(words)
    clean_words <- gsub("\\W", "", lower_words)
    clean_words <- clean_words[which(clean_words != "")]
    word_df <- as.data.frame(table(clean_words)/length(clean_words))
    file_word_freq <- cbind(ID=files_v[i], word_df)
    corpus_data <- rbind(corpus_data, file_word_freq)
  } else {
    # Be kind and report the error!
    cat("file", files_v[i], "has no found words.")
  }
}

##############################################
# Now munge and reshape the data

# xtabs works like "pivot table" in Excel
final_df <- xtabs(Freq ~ ID+clean_words, data=corpus_data)

# Let's only study the 100 most frequeny features in the data
the_means <- colMeans(final_df)
max_cols <- length(my_words)
if(length(the_means) < max_cols){
  max_cols <- length(the_means)
}
keepers <- names(sort(the_means, decreasing = TRUE)[1:max_cols])

cluster_data <- final_df[,keepers]
stop()
# Calculate distance,  cluster, and plot dendrogram.
dm <- dist(cluster_data)
hc <- hclust(dm)
plot(hc, hang=-1, main = paste(my_words, collapse = " "))