1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# Day Three Code

input_dir <- "data/plainText/"
files_v <- dir(input_dir, "\\.txt$")
if (length(files_v) == 0) {
  message("No .txt files found in ", input_dir)
}

# Print a vector of file names in a user friendly format: one
# "<index> <name>" line per file. Prints nothing for an empty vector.
#
# file_names_v: character vector of file names.
# Returns NULL invisibly; called only for its printed output.
show_files <- function(file_names_v) {
  # seq_along() instead of 1:length() so an empty vector loops zero times
  for (i in seq_along(file_names_v)) {
    cat(i, file_names_v[i], "\n", sep = " ")
  }
  invisible(NULL)
}

show_files(files_v)

# Read each file and split it into a vector of lowercase word tokens.
#
# files_input: character vector of file names located in in_dir.
# in_dir: directory holding the files (with or without a trailing slash).
# Returns a list with one character vector of tokens per file, named by
# file name; an empty list when files_input is empty.
make_file_word_v <- function(files_input, in_dir) {
  # set up empty container
  text_word_v_l <- list()
  # loop over the files
  for (i in seq_along(files_input)) {
    # Build the path from the in_dir argument rather than the global
    # input_dir, so the function honours the directory it is given
    text_v <- scan(
      file.path(in_dir, files_input[i]),
      what = "character",
      sep = "\n"
    )
    # Collapse the lines into one lowercase string, then split on
    # every non-word character
    text_v <- tolower(paste(text_v, collapse = " "))
    text_tokens_v <- unlist(strsplit(text_v, "\\W"))
    # remove those stinkin blanks. . .
    text_tokens_v <- text_tokens_v[text_tokens_v != ""]
    text_word_v_l[[files_input[i]]] <- text_tokens_v
  }
  text_word_v_l
}

my_corpus <- make_file_word_v(files_v, input_dir)

# Finding words and their neighbors. . .
# Here is an example of what we have in our "my_corpus" list object.
# Work on the first text; fall back to an empty vector when no files
# were loaded so the example prints nothing instead of failing.
first_text_v <- if (length(my_corpus) > 0) my_corpus[[1]] else character(0)
which(head(first_text_v, 10) == "jane")

girls_positions <- which(first_text_v == "girl")
context <- 3
for (i in seq_along(girls_positions)) {
  instance_index <- girls_positions[i]
  # Clamp the window to the text: an unclamped start below 1 mixes
  # negative and positive subscripts (an error), and an end past the
  # last word pads the output with NA
  start <- max(1, instance_index - context)
  end <- min(length(first_text_v), instance_index + context)
  instance_output <- first_text_v[start:end]
  cat(instance_index, instance_output, "\n")
}

######################################################
# A better example because it is more abstract. . . .
######################################################

# First the Required arguments. . .
context <- 5
text_id <- 2
my_word <- "whale"

# Extract the words surrounding one keyword hit, clamped to the text.
#
# words_v: character vector of word tokens.
# position: index of the keyword within words_v.
# context: maximum number of words to take on each side.
# Returns a list with elements left, keyword and right. left and right
# hold fewer than context words (possibly none) when the hit is near the
# start or end of words_v.
kwic_window <- function(words_v, position, context) {
  # Clamp both sides: an unclamped left range such as -4:0 would be read
  # as "drop the first four words", and a right range running past the
  # end would pad the result with NA
  n_left <- max(0, min(context, position - 1))
  n_right <- max(0, min(context, length(words_v) - position))
  list(
    left = words_v[position - rev(seq_len(n_left))],
    keyword = words_v[position],
    right = words_v[position + seq_len(n_right)]
  )
}

# Now the main script
# my_corpus is the list of per-file word vectors built earlier in this
# script; text_id selects which text to search.
text_words_v <- my_corpus[[text_id]]
positions_v <- which(text_words_v == my_word)
# seq_along() so that zero hits means zero iterations
for (i in seq_along(positions_v)) {
  hit <- kwic_window(text_words_v, positions_v[i], context)
  cat("------------", i, "------------", "\n")
  cat(hit$left, "[", hit$keyword, "]", hit$right, "\n")
}

###############################################################
# Modified version to collect context words to the left and right. . .
###############################################################
context <- 5
text_id <- 2
text_words_v <- my_corpus[[text_id]]
positions_v <- which(text_words_v == "whale")

# One list slot per hit, flattened after the loop, instead of growing
# the vectors with c() on every iteration
left_words_l <- vector("list", length(positions_v))
right_words_l <- vector("list", length(positions_v))
for (i in seq_along(positions_v)) {
  hit <- kwic_window(text_words_v, positions_v[i], context)
  left_words_l[[i]] <- hit$left
  right_words_l[[i]] <- hit$right
  cat("------------", i, "------------", "\n")
  cat(hit$left, "[", hit$keyword, "]", hit$right, "\n")
}
# Left-hand words accumulate into my_left_words (the original appended
# to an undefined my_words and therefore failed on the first hit)
my_left_words <- unlist(left_words_l)
my_right_words <- unlist(right_words_l)

# Ten most frequent neighbours on each side; head() returns fewer rows
# when there are fewer than ten distinct words instead of padding with NA
head(sort(table(my_right_words), decreasing = TRUE), 10)
head(sort(table(my_left_words), decreasing = TRUE), 10)