# Day Three Code

# Day Three Code
# Directory that holds the plain-text corpus files.
input_dir <- "data/plainText/"
# File names (without the directory part) of everything ending in ".txt".
files_v <- list.files(path = input_dir, pattern = "\\.txt$")

# Print a vector of file names in a user friendly format:
# one "<index> <name>" line per file.
#
# Args:
#   file_names_v: vector of file names to list.
#
# Returns:
#   NULL, invisibly; the listing is written to the console with cat().
show_files <- function(file_names_v){
  # seq_along() gives an empty sequence for an empty vector, whereas
  # 1:length() would count 1, 0 and print two bogus entries.
  for(i in seq_along(file_names_v)){
    cat(i, file_names_v[i], "\n", sep=" ")
  }
}

# List the corpus files that were found, numbered from 1.
show_files(file_names_v = files_v)

# Read text files and split each one into a vector of lowercase word tokens.
#
# Args:
#   files_input: character vector of file names, relative to in_dir.
#   in_dir: directory containing the files. It is pasted directly in front
#     of each file name, so it must end with a path separator
#     (e.g. "data/plainText/").
#
# Returns:
#   A list with one character vector of tokens per file, named by file name.
#   An empty files_input gives an empty list.
make_file_word_v <- function(files_input, in_dir){
  # set up empty container
  text_word_v_l <- list()
  # loop over the files; seq_along() skips the loop when there are none
  for(i in seq_along(files_input)){
    # build the path from the in_dir argument rather than a global variable
    file_path <- paste0(in_dir, files_input[i])
    # one element per line of the file
    text_v <- scan(file_path, what = "character", sep="\n")
    # glue the lines into a single lowercase string
    text_v <- tolower(paste(text_v, collapse = " "))
    # split on every non-word character
    text_tokens_v <- unlist(strsplit(text_v, "\\W"))
    # remove those stinkin blanks. . . (left by adjacent non-word characters)
    text_tokens_v <- text_tokens_v[text_tokens_v != ""]
    text_word_v_l[[files_input[i]]] <- text_tokens_v
  }
  return(text_word_v_l)
}

# Tokenize every file that was found; the result is a list holding
# one word vector per file.
my_corpus <- make_file_word_v(files_input = files_v, in_dir = input_dir)

# Finding words and their neighbors. . .
# A quick look inside the "my_corpus" list object: at which of the
# first ten positions of the first text does "jane" occur?
which("jane" == my_corpus[[1]][seq_len(10)])

# Print every occurrence of "girl" in the first text, preceded by its
# position and surrounded by up to `context` words on either side.
girls_positions <- which(my_corpus[[1]] == "girl")
context <- 3
# seq_along() makes the loop a no-op when "girl" never occurs;
# 1:length() would run it with the indices 1 and 0.
for(i in seq_along(girls_positions)){
  instance_index <- girls_positions[i]
  # Clamp the window to the text: a negative start would mix negative and
  # positive subscripts (an error) and an end past the last word would
  # pad the output with NA.
  start <- max(1, instance_index - context)
  end <- min(length(my_corpus[[1]]), instance_index + context)
  instance_output <- my_corpus[[1]][start:end]
  cat(instance_index, instance_output, "\n")
}

######################################################
# A better example because it is more abstract. . . .
######################################################

# First the Required arguments. . . 
context <- 5        # number of words to show on each side of the keyword
text_id <- 2        # which element of my_corpus to search
my_word <- "whale"  # keyword to look for

# Now the main script: print each occurrence of my_word in the chosen text
# with up to `context` words before and after it.
text_words_v <- my_corpus[[text_id]]
last_index <- length(text_words_v)
positions_v <- which(text_words_v == my_word)
# seq_along() skips the loop entirely when the word is never found
for(i in seq_along(positions_v)){
  instance_index <- positions_v[i]
  # Left window, clamped at the first word. Without the clamp a keyword near
  # the start yields negative subscripts, which drop elements instead of
  # selecting them.
  l_start <- max(1, instance_index - context)
  l_end <- instance_index - 1
  left_context <- if(l_start <= l_end) text_words_v[l_start:l_end] else character(0)
  # Right window, clamped at the last word so it is not padded with NA.
  r_end <- min(last_index, instance_index + context)
  r_start <- instance_index + 1
  right_context <- if(r_start <= r_end) text_words_v[r_start:r_end] else character(0)
  keyword <- text_words_v[instance_index]
  cat("------------", i, "------------", "\n")
  cat(left_context, "[", keyword, "]",  right_context, "\n")
}


###############################################################
# Modified version to collect context words to the left and right. . . 
###############################################################
# Same keyword-in-context listing for "whale", but the words seen to the
# left and to the right of each occurrence are also collected so that
# their frequencies can be tabulated afterwards.
context <- 5
text_id <- 2
text_words_v <- my_corpus[[text_id]]
last_index <- length(text_words_v)
positions_v <- which(text_words_v == "whale")
# One slot per occurrence; flattened after the loop instead of growing a
# vector with c() on every iteration.
left_words_l <- vector("list", length(positions_v))
right_words_l <- vector("list", length(positions_v))

# seq_along() skips the loop entirely when the word is never found
for(i in seq_along(positions_v)){
  instance_index <- positions_v[i]
  # Left window, clamped at the first word of the text.
  l_start <- max(1, instance_index - context)
  l_end <- instance_index - 1
  left_context <- if(l_start <= l_end) text_words_v[l_start:l_end] else character(0)
  # The original appended to an undefined `my_words` here; the left-hand
  # words belong in the left-hand collection.
  left_words_l[[i]] <- left_context
  # Right window, clamped at the last word so it is not padded with NA.
  r_end <- min(last_index, instance_index + context)
  r_start <- instance_index + 1
  right_context <- if(r_start <= r_end) text_words_v[r_start:r_end] else character(0)
  right_words_l[[i]] <- right_context
  keyword <- text_words_v[instance_index]
  cat("------------", i, "------------", "\n")
  cat(left_context, "[", keyword, "]",  right_context, "\n")
}

my_left_words <- unlist(left_words_l)
my_right_words <- unlist(right_words_l)

# Ten most frequent neighbours on each side; head() returns fewer rows
# rather than NA entries when there are under ten distinct words.
head(sort(table(my_right_words), decreasing = TRUE), 10)
head(sort(table(my_left_words), decreasing = TRUE), 10)