# Day Two Code

# day_two_code.R
###########################################
# There are two types of programmers:     #
# those who comment their code and those  #
# who are going to comment their code.    #
###########################################

#########################
# Day 2 Part 1
#########################
# Read the novel as a character vector, one element per line of the file.
text_v <- scan(
  file = "data/plainText/melville.txt",
  what = "character",
  sep = "\n"
)

# Keep only the lines between these two positions. The bounds are specific to
# this copy of the text file -- presumably they strip the front and back
# matter; re-check them if the file changes.
novel_lines <- text_v[408:18576]

# Locate the chapter headings, then append a sentinel "END" line so that the
# last chapter also has a closing boundary.
chap_pos <- grep("^CHAPTER \\d", novel_lines)
novel_lines <- c(novel_lines, "END")
last_pos <- length(novel_lines)
chap_pos <- c(chap_pos, last_pos)

# Relative word frequencies for each chapter, keyed by the heading line.
chap_freqs_l <- list()

# The final entry of chap_pos is the sentinel, so only the first
# length(chap_pos) - 1 entries mark the start of a chapter.
for (i in seq_len(length(chap_pos) - 1)) {
  start <- chap_pos[i] + 1
  end <- chap_pos[i + 1] - 1
  chap_lines <- novel_lines[start:end]
  chap_words <- tolower(paste(chap_lines, collapse = " "))
  chap_words_l <- strsplit(chap_words, "\\W")
  chap_words_v <- unlist(chap_words_l)
  # Splitting on every non-word character leaves empty strings behind.
  chap_words_v <- chap_words_v[which(chap_words_v != "")]
  # Divide the counts by the chapter length to get relative frequencies.
  chap_words_t <- table(chap_words_v) / length(chap_words_v)
  chap_title <- novel_lines[chap_pos[i]]
  chap_freqs_l[[chap_title]] <- chap_words_t
}

# Extract the "whale" and "ahab" entries from every chapter table; a chapter
# that lacks the word contributes NA.
my_whales <- lapply(chap_freqs_l, "[", "whale")
whales_m <- do.call(rbind, my_whales)
my_ahabs <- lapply(chap_freqs_l, "[", "ahab")
ahabs_m <- do.call(rbind, my_ahabs)

# One row per chapter, one column per word.
whales_ahabs_m <- cbind(whales_m[, 1], ahabs_m[, 1])
colnames(whales_ahabs_m) <- c("Whale", "Ahab")
barplot(whales_ahabs_m, beside = TRUE, col = "blue")

# Treat a missing word as zero frequency; otherwise cor() would return NA.
whales_ahabs_m[is.na(whales_ahabs_m)] <- 0
cor(whales_ahabs_m[, 1], whales_ahabs_m[, 2])

##################################################
# Day 2 Part 2: Randomization and Permutation
##################################################

# Convert the matrix to a data frame so the columns can be addressed by name.
w_a_df <- as.data.frame(whales_ahabs_m)
cor(w_a_df)

# Permutation test: shuffle the Whale column, correlate it with the unshuffled
# Ahab column, and repeat. The result vector is preallocated instead of being
# grown with c() on every pass, which copies the whole vector each time.
n_iter <- 10000
my_vals <- numeric(n_iter)
for (i in seq_len(n_iter)) {
  my_vals[i] <- cor(sample(w_a_df$Whale), w_a_df$Ahab)
}

# Summarize the results
summary(my_vals)

# Plot the results as a histogram, titled with the observed correlation.
plot_title <- paste0("True Correlation = ", cor(w_a_df)[1, 2])
h <- hist(
  my_vals,
  breaks = 100,
  col = "blue",
  xlab = "Correlation",
  main = plot_title,
  plot = TRUE
)

# Overlay a normal curve with the same mean and standard deviation. The
# density is multiplied by the bin width (distance between the first two bin
# midpoints) and the number of values so it sits on the histogram's count
# scale.
xfit <- seq(min(my_vals), max(my_vals), length.out = 10000)
yfit <- dnorm(xfit, mean(my_vals), sd = sd(my_vals))
yfit <- yfit * diff(h$mids[1:2]) * length(my_vals)
lines(xfit, yfit, col = "red", lwd = 2)


##################################################
# Day 2 Part 3: Vocabulary Richness
##################################################
# Clear Environment
# NOTE(review): rm(list = ls()) deletes every object in the workspace,
# including anything the user created outside this script. It is kept here
# because this part is meant to start from a clean slate; consider running the
# part in a fresh R session instead.
rm(list = ls())

# Same chapter-splitting code as Part 1, with one change: each chapter table
# holds raw word counts rather than relative frequencies.

# Read the novel as a character vector, one element per line of the file.
text_v <- scan(
  file = "data/plainText/melville.txt",
  what = "character",
  sep = "\n"
)

# Keep only the lines between these two positions. The bounds are specific to
# this copy of the text file -- presumably they strip the front and back
# matter; re-check them if the file changes.
novel_lines <- text_v[408:18576]

# Locate the chapter headings, then append a sentinel "END" line so that the
# last chapter also has a closing boundary.
chap_pos <- grep("^CHAPTER \\d", novel_lines)
novel_lines <- c(novel_lines, "END")
last_pos <- length(novel_lines)
chap_pos <- c(chap_pos, last_pos)

# Raw word counts for each chapter, keyed by the heading line.
chap_freqs_l <- list()

# The final entry of chap_pos is the sentinel, so only the first
# length(chap_pos) - 1 entries mark the start of a chapter.
for (i in seq_len(length(chap_pos) - 1)) {
  start <- chap_pos[i] + 1
  end <- chap_pos[i + 1] - 1
  chap_lines <- novel_lines[start:end]
  chap_words <- tolower(paste(chap_lines, collapse = " "))
  chap_words_l <- strsplit(chap_words, "\\W")
  chap_words_v <- unlist(chap_words_l)
  # Splitting on every non-word character leaves empty strings behind.
  chap_words_v <- chap_words_v[which(chap_words_v != "")]
  # The changed line: no division by length(chap_words_v), so the table keeps
  # the raw counts.
  chap_words_t <- table(chap_words_v)
  chap_title <- novel_lines[chap_pos[i]]
  chap_freqs_l[[chap_title]] <- chap_words_t
}

# Token count of the first chapter: the sum of its word counts.
sum(chap_freqs_l[[1]])

# Token count of every chapter, stacked into a one-column matrix whose rows
# are named by chapter.
chap_lens <- lapply(chap_freqs_l, function(chap_counts) sum(chap_counts))
chap_lens_m <- do.call("rbind", chap_lens)

# One vertical line per chapter, its height being the chapter length.
plot(chap_lens_m, type = "h")

# Length of the longest chapter (every tied maximum is returned).
chap_lens_m[chap_lens_m == max(chap_lens_m)]

# Average chapter length and the overall distribution.
mean(chap_lens_m)
summary(chap_lens_m)

##################################################
# A Function for calculating ttr
##################################################
# Type-token ratio of a word-count table.
#
# type_table: a table (or numeric vector) with one entry per distinct word
#   (type), each entry holding the number of occurrences (tokens) of that word.
#
# Returns the number of types divided by the number of tokens. The previous
# version divided tokens by types, which is the reciprocal of the ratio (the
# mean number of occurrences per word). An empty table yields NaN (0 / 0).
ttr <- function(type_table) {
  t_types <- length(type_table)
  t_tokens <- sum(type_table)
  t_types / t_tokens
}

# Run ttr() on every chapter table and stack the results into a one-column
# matrix with one row per chapter.
my_ttrs <- lapply(chap_freqs_l, function(chap_counts) ttr(chap_counts))
ttr_results_m <- do.call("rbind", my_ttrs)
summary(ttr_results_m)

# Is there a correlation?
# Pair each chapter's length (first column) with its ttr() value (second
# column) and correlate the two columns.
ttr_len_m <- cbind(chap_lens_m[, 1], ttr_results_m[, 1])
ttr_len_df <- as.data.frame(ttr_len_m)
cor(ttr_len_df)

# Count the hapax legomena in a word-count table, i.e. the number of entries
# whose count is exactly 1.
#
# type_table: a table (or numeric vector) of per-word counts.
#
# Returns a single integer.
hapax <- function(type_table) {
  occurs_once <- type_table == 1
  # na.rm = TRUE ignores missing counts rather than propagating NA.
  sum(occurs_once, na.rm = TRUE)
}

# Run hapax() on every chapter table and stack the counts into a one-column
# matrix with one row per chapter.
hapax_l <- lapply(chap_freqs_l, function(chap_counts) hapax(chap_counts))
hapax_m <- do.call("rbind", hapax_l)

# Pair each chapter's length (first column) with its hapax count (second
# column) and show the correlation matrix of the two columns.
hapax_len_m <- cbind(chap_lens_m[, 1], hapax_m[, 1])
cor(hapax_len_m)
# Matthew L. Jockers

# "Everything . . . in nature's vast workshop from the extinction of some remote sun to the blossoming of one of the countless flowers which beautify our public parks is subject to a law of numeration as yet unascertained." (Joyce, Ulysses, 1922)