# day_two_code.R
###########################################
# There are two types of programmers: #
# those who comment their code and those #
# who are going to comment their code. #
###########################################
#########################
# Day 2 Part 1
#########################
# Read the novel as a character vector, one element per line of the file.
text_v <- scan(
  file = "data/plainText/melville.txt",
  what = "character",
  sep = "\n"
)
# Hard-coded line range: keeps only the part of this particular file that is
# analysed below.
novel_lines <- text_v[408:18576]
# Positions of the chapter headings within the retained lines.
chap_pos <- grep("^CHAPTER \\d", novel_lines)
# Append a sentinel line so the final chapter has a closing boundary.
novel_lines <- c(novel_lines, "END")
last_pos <- length(novel_lines)
chap_pos <- c(chap_pos, last_pos)
# Named list: chapter heading -> table of relative word frequencies.
chap_freqs_l <- list()
# The last entry of chap_pos is the sentinel, so iterate over every entry
# except that one; seq_len() yields an empty sequence when no heading matched.
for (i in seq_len(length(chap_pos) - 1)) {
  # Chapter text runs from the line after this heading to the line before
  # the next heading (or before the sentinel).
  start <- chap_pos[i] + 1
  end <- chap_pos[i + 1] - 1
  chap_lines <- novel_lines[start:end]
  chap_words <- tolower(paste(chap_lines, collapse = " "))
  # Split on every non-word character; adjacent separators leave empty
  # strings behind, which are removed next.
  chap_words_l <- strsplit(chap_words, "\\W")
  chap_words_v <- unlist(chap_words_l)
  chap_words_v <- chap_words_v[chap_words_v != ""]
  # Relative frequency: count of each word divided by the chapter's word total.
  chap_words_t <- table(chap_words_v) / length(chap_words_v)
  chap_title <- novel_lines[chap_pos[i]]
  chap_freqs_l[[chap_title]] <- chap_words_t
}
# Pull one word's relative frequency out of every chapter table; chapters
# where the word does not occur yield NA.
my_whales <- lapply(chap_freqs_l, "[", "whale")
whales_m <- do.call(rbind, my_whales)
my_ahabs <- lapply(chap_freqs_l, "[", "ahab")
ahabs_m <- do.call(rbind, my_ahabs)
# One row per chapter, one column per word.
whales_ahabs_m <- cbind(whales_m[, 1], ahabs_m[, 1])
colnames(whales_ahabs_m) <- c("Whale", "Ahab")
barplot(whales_ahabs_m, beside = TRUE, col = "blue")
# Treat "word absent from chapter" as frequency 0 so cor() can use every row.
whales_ahabs_m[is.na(whales_ahabs_m)] <- 0
cor(whales_ahabs_m[, 1], whales_ahabs_m[, 2])
##################################################
# Day 2 Part 2: Randomization and Permutation
##################################################
# Convert the chapter-by-word matrix to a data frame so columns can be
# addressed by name.
w_a_df <- as.data.frame(whales_ahabs_m)
cor(w_a_df)
# Permutation test: shuffle the Whale column, correlate it with the unshuffled
# Ahab column, and repeat to see which correlations arise by chance alone.
# NOTE(review): no seed is set, so the results differ from run to run.
n_perm <- 10000
# Preallocate the result vector instead of growing it with c() on every pass.
my_vals <- numeric(n_perm)
for (i in seq_len(n_perm)) {
  my_vals[i] <- cor(sample(w_a_df$Whale), w_a_df$Ahab)
}
# Summarize the results
summary(my_vals)
# Plot the results as a histogram, titled with the observed correlation.
plot_title <- paste0("True Correlation = ", cor(w_a_df)[1, 2])
h <- hist(
  my_vals,
  breaks = 100,
  col = "blue",
  xlab = "Correlation",
  main = plot_title,
  plot = TRUE
)
# Overlay a normal curve with the same mean and standard deviation as the
# permuted correlations.
xfit <- seq(min(my_vals), max(my_vals), length.out = 10000)
yfit <- dnorm(xfit, mean = mean(my_vals), sd = sd(my_vals))
# Rescale the density to the histogram's count axis: bin width times the
# number of observations.
yfit <- yfit * diff(h$mids[1:2]) * length(my_vals)
lines(xfit, yfit, col = "red", lwd = 2)
##################################################
# Day 2 Part 3: Vocabulary Richness
##################################################
# Clear Environment
# NOTE(review): rm(list = ls()) removes every non-hidden object in the current
# environment, including anything created outside this script; restarting the
# R session is a safer way to get a clean slate.
rm(list = ls())
# Same chapter-splitting code as Part 1, except that each chapter keeps its
# raw word counts rather than relative frequencies.
text_v <- scan(
  file = "data/plainText/melville.txt",
  what = "character",
  sep = "\n"
)
# Hard-coded line range: keeps only the part of this particular file that is
# analysed below.
novel_lines <- text_v[408:18576]
chap_pos <- grep("^CHAPTER \\d", novel_lines)
# Append a sentinel line so the final chapter has a closing boundary.
novel_lines <- c(novel_lines, "END")
last_pos <- length(novel_lines)
chap_pos <- c(chap_pos, last_pos)
# Named list: chapter heading -> table of raw word counts.
chap_freqs_l <- list()
# The last entry of chap_pos is the sentinel, so iterate over every entry
# except that one.
for (i in seq_len(length(chap_pos) - 1)) {
  start <- chap_pos[i] + 1
  end <- chap_pos[i + 1] - 1
  chap_lines <- novel_lines[start:end]
  chap_words <- tolower(paste(chap_lines, collapse = " "))
  # Split on every non-word character, then drop the empty strings that
  # adjacent separators leave behind.
  chap_words_l <- strsplit(chap_words, "\\W")
  chap_words_v <- unlist(chap_words_l)
  chap_words_v <- chap_words_v[chap_words_v != ""]
  # The one changed line: Part 1 divided this table by length(chap_words_v);
  # raw counts are kept here because the code below sums and counts them.
  chap_words_t <- table(chap_words_v)
  chap_title <- novel_lines[chap_pos[i]]
  chap_freqs_l[[chap_title]] <- chap_words_t
}
# Number of words (tokens) in the first chapter.
sum(chap_freqs_l[[1]])
# Number of words in every chapter, collected into a one-column matrix.
chap_lens <- lapply(chap_freqs_l, sum)
chap_lens_m <- do.call(rbind, chap_lens)
plot(chap_lens_m, type = "h")
# Length of the longest chapter (every tied value is returned).
chap_lens_m[chap_lens_m == max(chap_lens_m)]
mean(chap_lens_m)
summary(chap_lens_m)
##################################################
# A Function for calculating ttr
##################################################
# Type-token ratio (TTR) of a word-frequency table.
#
# type_table: a table (or named numeric vector) of counts, one entry per
#   distinct word type.
# Returns: the number of distinct types divided by the total number of
#   tokens. For an empty table this is 0 / 0, i.e. NaN.
ttr <- function(type_table) {
  t_types <- length(type_table)
  t_tokens <- sum(type_table)
  # TTR is types / tokens. The inverse, tokens / types, is the mean word
  # frequency rather than the type-token ratio.
  t_types / t_tokens
}
# Run ttr() on each chapter's frequency table, then stack the per-chapter
# results into a one-column matrix (one row per chapter).
my_ttrs <- lapply(X = chap_freqs_l, FUN = ttr)
ttr_results_m <- do.call(what = rbind, args = my_ttrs)
summary(ttr_results_m)
# Put chapter length next to the ttr() result and look at how the two
# columns correlate.
ttr_len_m <- cbind(
  chap_lens_m[, 1],
  ttr_results_m[, 1]
)
ttr_len_df <- as.data.frame(ttr_len_m)
cor(ttr_len_df)
# Count the hapax legomena in a frequency table.
#
# type_table: a table (or named numeric vector) of word counts.
# Returns: an integer, the number of entries whose count is exactly 1.
hapax <- function(type_table) {
  occurs_once <- type_table == 1
  # Missing comparisons are not counted.
  sum(occurs_once, na.rm = TRUE)
}
# Count the once-only words in each chapter, then stack the counts into a
# one-column matrix (one row per chapter).
hapax_l <- lapply(X = chap_freqs_l, FUN = hapax)
hapax_m <- do.call(what = rbind, args = hapax_l)
# Chapter length beside hapax count; cor() on the matrix returns the
# correlation matrix of the two columns.
hapax_len_m <- cbind(
  chap_lens_m[, 1],
  hapax_m[, 1]
)
cor(hapax_len_m)