# Day Three Code
# Location of the plain-text corpus (the trailing slash is kept so the
# path can be pasted directly in front of a file name).
input_dir <- "data/plainText/"
# Names of every file in that directory whose name ends in ".txt"
files_v <- list.files(path = input_dir, pattern = "\\.txt$")
# Print a vector of file names in a user friendly format.
#
# Each name is written to the console on its own line, preceded by its
# position in the vector, e.g. "1 austen.txt".
#
# Args:
#   file_names_v: character vector of file names (may be empty).
#
# Returns:
#   NULL, invisibly; the function is called for its console output.
show_files <- function(file_names_v) {
  # seq_along() yields an empty sequence for an empty vector, whereas
  # 1:length(x) would iterate over c(1, 0) and print bogus lines.
  for (i in seq_along(file_names_v)) {
    cat(i, file_names_v[i], "\n", sep = " ")
  }
  invisible(NULL)
}
# List the corpus files with their index numbers
show_files(file_names_v = files_v)
# Read a set of text files and tokenise each one into a vector of words.
#
# Every file is read line by line, joined into a single lower-cased
# string, split on non-word characters, and stripped of empty tokens.
#
# Args:
#   files_input: character vector of file names located in `in_dir`.
#   in_dir: directory prefix pasted directly in front of each file name,
#     so it must end with a path separator (e.g. "data/plainText/").
#
# Returns:
#   A list with one character vector of word tokens per file, named by
#   the file names in `files_input`. An empty `files_input` gives an
#   empty list.
make_file_word_v <- function(files_input, in_dir) {
  # set up empty container
  text_word_v_l <- list()
  # seq_along() keeps the loop from running at all on empty input
  for (i in seq_along(files_input)) {
    # Build the path from the `in_dir` argument, not from a global variable
    file_path <- paste0(in_dir, files_input[i])
    text_v <- scan(file_path, what = "character", sep = "\n")
    text_v <- tolower(paste(text_v, collapse = " "))
    text_tokens_v <- unlist(strsplit(text_v, "\\W"))
    # Splitting on \W leaves empty strings between adjacent
    # punctuation/space characters; drop them.
    text_tokens_v <- text_tokens_v[nzchar(text_tokens_v)]
    text_word_v_l[[files_input[i]]] <- text_tokens_v
  }
  text_word_v_l
}
# Tokenise every corpus file into a named list of word vectors
my_corpus <- make_file_word_v(files_input = files_v, in_dir = input_dir)
# Finding words and their neighbors. . .
# Here is an example of what we have in our "my_corpus" list object:
# the positions of "jane" among the first ten words of the first text.
which(my_corpus[[1]][1:10] == "jane")
# Print every occurrence of "girl" in the first text together with up to
# `context` words on either side, prefixed by the token position of the hit.
first_text_v <- my_corpus[[1]]
girls_positions <- which(first_text_v == "girl")
context <- 3
# seq_along() makes the loop a no-op when "girl" never occurs
for (i in seq_along(girls_positions)) {
  instance_index <- girls_positions[i]
  # Clamp the window to the text so hits near the beginning or end do not
  # produce invalid (zero/negative) subscripts or NA padding.
  start <- max(1, instance_index - context)
  end <- min(length(first_text_v), instance_index + context)
  instance_output <- first_text_v[start:end]
  cat(instance_index, instance_output, "\n")
}
######################################################
# A better example because it is more abstract. . . .
######################################################
# First the required arguments. . .
context <- 5        # number of words to show on each side of the keyword
text_id <- 2        # index into my_corpus of the text to search
my_word <- "whale"  # keyword to look for
# Now the main script
text_words_v <- my_corpus[[text_id]]
n_words <- length(text_words_v)
positions_v <- which(text_words_v == my_word)
# seq_along() makes the loop a no-op when the keyword never occurs
for (i in seq_along(positions_v)) {
  instance_index <- positions_v[i]
  # Left window: clamped at the first token; empty when the hit is token 1
  l_start <- max(1, instance_index - context)
  l_end <- instance_index - 1
  left_context <- if (l_start <= l_end) {
    text_words_v[l_start:l_end]
  } else {
    character(0)
  }
  # Right window: clamped at the last token; empty when the hit is last
  r_start <- instance_index + 1
  r_end <- min(n_words, instance_index + context)
  right_context <- if (r_start <= r_end) {
    text_words_v[r_start:r_end]
  } else {
    character(0)
  }
  keyword <- text_words_v[instance_index]
  cat("------------", i, "------------", "\n")
  cat(left_context, "[", keyword, "]", right_context, "\n")
}
###############################################################
# Modified version to collect context words to the left and right. . .
###############################################################
context <- 5        # number of words to collect on each side of the keyword
text_id <- 2        # index into my_corpus of the text to search
my_word <- "whale"  # keyword to look for
text_words_v <- my_corpus[[text_id]]
n_words <- length(text_words_v)
positions_v <- which(text_words_v == my_word)
# One list slot per hit; filled in the loop and flattened afterwards so the
# result vectors are not re-copied on every iteration.
left_words_l <- vector("list", length(positions_v))
right_words_l <- vector("list", length(positions_v))
# seq_along() makes the loop a no-op when the keyword never occurs
for (i in seq_along(positions_v)) {
  instance_index <- positions_v[i]
  # Left window: clamped at the first token; empty when the hit is token 1
  l_start <- max(1, instance_index - context)
  l_end <- instance_index - 1
  left_context <- if (l_start <= l_end) {
    text_words_v[l_start:l_end]
  } else {
    character(0)
  }
  left_words_l[[i]] <- left_context
  # Right window: clamped at the last token; empty when the hit is last
  r_start <- instance_index + 1
  r_end <- min(n_words, instance_index + context)
  right_context <- if (r_start <= r_end) {
    text_words_v[r_start:r_end]
  } else {
    character(0)
  }
  right_words_l[[i]] <- right_context
  keyword <- text_words_v[instance_index]
  cat("------------", i, "------------", "\n")
  cat(left_context, "[", keyword, "]", right_context, "\n")
}
# All words seen to the left / right of the keyword, across every hit
my_left_words <- unlist(left_words_l)
my_right_words <- unlist(right_words_l)
# Ten most frequent neighbours on each side; head() returns fewer rows
# instead of NA padding when there are not ten distinct words.
head(sort(table(my_right_words), decreasing = TRUE), 10)
head(sort(table(my_left_words), decreasing = TRUE), 10)