# DHWI: R Code Day Two

###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R. 
# Day 2
###############################################################

# Don't forget to set your working directory. . . . 

# Read the plain text of Moby Dick; each element of `text` is one line of the file.
text <- scan("data/plaintext/melville.txt", what = "character", sep = "\n")

# Trim the Project Gutenberg front and back matter: keep everything from the
# first chapter heading up to (but not including) the end-of-book marker.
start <- which(text == "CHAPTER 1. Loomings.")
end <- which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
novel.lines <- text[start:(end - 1)]

# Line index of every chapter heading, with the novel's final line appended
# so the last chapter has a closing boundary.  ("postions" [sic] is kept
# because later code refers to this name.)
last.position <- length(novel.lines)
novel.chapter.postions <- c(grep("^CHAPTER \\d", novel.lines), last.position)

# Create two list objects in which to store type/token tables for each chapter:
#   chapter.list.raws  -- raw word counts per chapter (table objects)
#   chapter.list.freqs -- relative frequencies per chapter, as percentages

chapter.list.freqs<-list()
chapter.list.raws<-list()

# Fill both lists, one chapter at a time.  The positions vector carries one
# extra entry (the novel's final line) as a closing sentinel, so iterate over
# the chapter headings only.  seq_len(n - 1) replaces the original
# 1:length(x) loop plus if() guard, avoiding the 1:length() footgun.
for(i in seq_len(length(novel.chapter.postions) - 1)){
  chapter.title<-novel.lines[novel.chapter.postions[i]]
  # Chapter body runs from the line after this heading to the line before
  # the next heading (or the final line of the novel).
  start<-novel.chapter.postions[i]+1
  end<-novel.chapter.postions[i+1]-1
  chapter.lines<-novel.lines[start:end]
  # Lowercase, split on non-word characters, and drop the empty strings
  # that strsplit leaves behind.
  chapter.words<-tolower(paste(chapter.lines, collapse=" "))
  chapter.words.vector<-unlist(strsplit(chapter.words, "\\W"))
  chapter.words.vector<-chapter.words.vector[chapter.words.vector!=""]
  # Raw counts, then relative frequencies expressed as percentages.
  chapter.freqs<-table(chapter.words.vector)
  chapter.list.raws[[chapter.title]]<-chapter.freqs
  chapter.list.freqs[[chapter.title]]<-100*(chapter.freqs/sum(chapter.freqs))
}

# Look up the relative frequency (a percentage, not a raw count) of
# "whale" in chapter 1.
chapter.list.freqs[[1]]["whale"]

# Pull the "whale" and "ahab" frequencies out of every chapter's table.
# Indexing by name yields NA for chapters where the word never occurs.
whale.list <- lapply(chapter.list.freqs, function(freqs) freqs["whale"])
ahab.list <- lapply(chapter.list.freqs, function(freqs) freqs["ahab"])

# Stack the per-chapter values into one-column matrices with do.call ...
whales <- do.call(rbind, whale.list)
ahabs <- do.call(rbind, ahab.list)

# ... and flatten those matrices back down to plain vectors.
v.whales <- as.vector(whales)
v.ahabs <- as.vector(ahabs)

# Bind the two vectors side by side into a chapters-by-2 matrix.
whales.ahabs <- cbind(v.whales, v.ahabs)

# Same thing, but supplying nicer column names ("whale", "ahab").
whales.ahabs <- cbind(whale = v.whales, ahab = v.ahabs)

# Replace NA cells (chapters where a word never occurs) with 0 so that
# plotting and correlation work on complete data.  Logical indexing with
# is.na() suffices; the extra which() wrapper was unnecessary.
whales.ahabs[is.na(whales.ahabs)]<-0

# Use colnames to make the names even better
colnames(whales.ahabs)<-c("WHALE", "AHAB")

# Plot the two columns side by side, one pair of bars per chapter.
# (TRUE rather than T: T is an ordinary variable and can be reassigned.)
barplot(whales.ahabs, beside=TRUE, col="black")

# Pearson correlation between the two frequency series.
cor(whales.ahabs[,1], whales.ahabs[,2])

# Recast the matrix as a data frame so columns can be addressed with `$`.
cordata<-as.data.frame(whales.ahabs)

# See how we can use the $
cor(cordata$WHALE, cordata$AHAB)

# Shuffle one column with sample() to see how large a correlation can
# arise purely by chance.
cor(sample(cordata$WHALE), cordata$AHAB)

# "GUI version": repeat the shuffle and print each chance correlation so
# you can eyeball the spread.
for(i in 1:100){
  print(cor(sample(cordata$WHALE), cordata$AHAB))
}

# Better way: collect the permuted correlations in one vector.
# replicate() preallocates the result; the original grew `myvar` with
# rbind() inside the loop, copying the whole object on every iteration.
myvar<-replicate(1000, cor(sample(cordata$WHALE), cordata$AHAB))
mean(myvar)
max(myvar)
min(myvar)
max(abs(myvar))  # largest chance correlation in either direction

# Or just use cor.test(), which reports the observed correlation together
# with a p-value and confidence interval.
cor.test(cordata$WHALE, cordata$AHAB)

####################################
# End of first session 10:30AM
####################################


################################################
# Begin 10:45-12:00 Session with fresh workspace
################################################
# Fresh workspace: reload the novel and rebuild the per-chapter type/token
# tables exactly as in the first session.
text<-scan("data/plaintext/melville.txt", what="character", sep="\n")
# Trim Gutenberg front/back matter.
start<-which(text == "CHAPTER 1. Loomings.")
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
novel.lines<- text[start:(end-1)]
# Chapter-heading line indices, plus the final line as a closing sentinel.
novel.chapter.postions<-grep("^CHAPTER \\d", novel.lines)
last.position<-length(novel.lines)
novel.chapter.postions<-c(novel.chapter.postions, last.position)
chapter.list.freqs<-list()
chapter.list.raws<-list()
# Iterate over the chapter headings only (everything but the sentinel);
# seq_len() replaces the original 1:length(x) loop plus if() guard.
for(i in seq_len(length(novel.chapter.postions) - 1)){
  chapter.title<-novel.lines[novel.chapter.postions[i]]
  # Body spans the lines between this heading and the next boundary.
  start<-novel.chapter.postions[i]+1
  end<-novel.chapter.postions[i+1]-1
  chapter.lines<-novel.lines[start:end]
  # Lowercase, split on non-word characters, drop empty strings.
  chapter.words<-tolower(paste(chapter.lines, collapse=" "))
  chapter.words.vector<-unlist(strsplit(chapter.words, "\\W"))
  chapter.words.vector<-chapter.words.vector[chapter.words.vector!=""]
  # Raw counts and relative frequencies (percentages).
  chapter.freqs<-table(chapter.words.vector)
  chapter.list.raws[[chapter.title]]<-chapter.freqs
  chapter.list.freqs[[chapter.title]]<-100*(chapter.freqs/sum(chapter.freqs))
}


sum(chapter.list.raws[[1]])    # total tokens in chapter 1
length(chapter.list.raws[[1]]) # total types (distinct words) in chapter 1

# Tokens per type for chapter 1 (mean times each word is used).
sum(chapter.list.raws[[1]])/length(chapter.list.raws[[1]])

mean(chapter.list.raws[[1]])    # same quantity via mean(), chapter 1
lapply(chapter.list.raws, mean) # mean word use for every chapter

# Save the per-chapter means as a one-column matrix; do.call(rbind, ...)
# keeps the chapter titles as row names.
mean.word.use<-do.call(rbind, lapply(chapter.list.raws, mean))

# Look ma, row names are in them thar vectors
rownames(mean.word.use)

# Plot the means as vertical lines ("h"), one per chapter.
plot(mean.word.use, type="h")

# Center and scale (z-scores) to see divergence from the overall mean.
scale(mean.word.use)
plot(scale(mean.word.use), type="h")

# order() returns the permutation of indices that sorts the values.
# (TRUE rather than the reassignable shorthand T.)
order(mean.word.use, decreasing=TRUE)

# Apply that permutation to list the chapter means in descending order.
mean.word.use[order(mean.word.use, decreasing=TRUE),]

# Total token count per chapter, as a one-column matrix with chapter
# titles as row names.
chapter.lengths<-do.call(rbind, lapply(chapter.list.raws, sum))

# Correlation between chapter length and mean word use -- the long way ...
my.m<-cbind(chapter.lengths, mean.word.use)
cor(my.m)

# ... and the short way.
cor(cbind(chapter.lengths, mean.word.use))

# cor.test() additionally reports a p-value and confidence interval.
cor.test(chapter.lengths, mean.word.use)

# Count the hapax legomena (words occurring exactly once) per chapter.
chapter.hapax<-sapply(chapter.list.raws, function(x) sum(x==1))

# Hapax counts as a proportion of each chapter's token total.
hapax.percentage<-chapter.hapax/chapter.lengths

# Bar plot of hapax proportion by chapter number.  seq_along() replaces
# the redundant seq(1:n) of the original, and TRUE the shorthand T.
barplot(hapax.percentage, beside=TRUE, col="grey",
        names.arg=seq_along(chapter.list.raws))

# Does the number of hapax correlate with chapter length?
cor.test(chapter.lengths, chapter.hapax)

################################################
# Afternoon Session 1, Day 2
# Clear your Workspace
# Clear your Console
# Clear your Mind
# Begin. . . 
################################################
# FUNCTIONs and Do It KWIC

# Directory holding the plain-text corpus.
# NOTE(review): earlier sessions read from "data/plaintext" (lower-case t)
# -- confirm which casing matches the actual directory on disk.
inputDir<-"data/plainText"
# List the .txt files.  Anchoring the pattern to the extension fixes the
# original ".*txt", which also matched any name merely containing "txt".
files<-dir(inputDir, "\\.txt$")

################################################
# show.files
# Print a vector of file names as a numbered,
# user-friendly list, e.g. "1 austen.txt".
#
# Args:
#   file.name.vector: character vector of file names.
# Returns: nothing useful (called for its printed output).
################################################
show.files<-function(file.name.vector){
  # seq_along() iterates zero times for an empty vector, whereas the
  # original 1:length(x) would run with i = 1 and i = 0 and print garbage.
  for(i in seq_along(file.name.vector)){
    cat(i, file.name.vector[i], "\n", sep=" ")
  }
}

# Call show.files() on the `files` vector built above to display the
# corpus contents as a numbered list.
show.files(files)

################################################
# make.file.word.list
# Read each file in `file.vector` from `inputDir` and tokenize it.
#
# Args:
#   file.vector: character vector of file names (no path component).
#   inputDir:    path to the directory containing those files.
# Returns: a named list with one element per file (named by file name),
#   each an ordered character vector of lower-cased word tokens.
################################################
make.file.word.list<-function(file.vector, inputDir){
  text.word.vector.list<-list()
  # seq_along() is safe for an empty file vector (1:length(x) is not).
  for(i in seq_along(file.vector)){
    # Read the file one line per element; file.path() builds the path
    # portably instead of paste(..., sep = "/").
    text.lines<-scan(file.path(inputDir, file.vector[i]),
                     what="character", sep="\n")
    # Collapse to one lower-cased string, split on non-word characters,
    # and drop the empty strings the split leaves behind.
    text.lower<-tolower(paste(text.lines, collapse=" "))
    text.word.vector<-unlist(strsplit(text.lower, "\\W"))
    text.word.vector<-text.word.vector[text.word.vector!=""]
    # Use the file name as this element's name in the returned list.
    text.word.vector.list[[file.vector[i]]]<-text.word.vector
  }
  return(text.word.vector.list)
}

# Build the corpus list: one element per file, each an ordered word vector.
my.corpus<-make.file.word.list(files, inputDir)

# Peek at the first 100 tokens of the first text.
my.corpus[[1]][1:100]

# Every position of "gutenberg" in the first text.  (The trailing empty
# [] in the original was redundant and has been dropped.)
positions<-which(my.corpus[[1]]=="gutenberg")

first.instance<-positions[1]

# Show the word on either side of the first hit.
cat(my.corpus[[1]][(first.instance-1):(first.instance+1)])

# Positions of "dog" in each novel (1 = Austen, 2 = Melville).
a.dogs<-which(my.corpus[[1]]=="dog")
m.dogs<-which(my.corpus[[2]]=="dog")

# A hard-coded KWIC (Key Word In Context) list: `context` words printed on
# each side of every hit.  seq_along() iterates zero times when there are
# no hits, whereas 1:length(m.dogs) would misfire on an empty result.
context<-2
for(i in seq_along(m.dogs)){
  cat(my.corpus[[2]][(m.dogs[i]-context):(m.dogs[i]+context)], "\n")
}

###############################################################
# Do It KWIC -- BETTER
# Clear your workspace before running this section.
# The helper functions now live in a separate file,
# "corpusFunctions.r", loaded below with source().
###############################################################
source("code/corpusFunctions.r") # load make.file.word.list() and friends
inputDir<-"data/plainText"
files<-dir(inputDir, ".*txt")
# NOTE(review): outputDir is defined but never used in this excerpt --
# presumably consumed by code in corpusFunctions.r; confirm.
outputDir<-"results/"
my.corpus<-make.file.word.list(files, inputDir) # defined in corpusFunctions.r

###############################################################
# For the rest of Day 2, see corpusFunctions.r:
# http://www.matthewjockers.net/materials/dhwi-r-code-functions-file/
###############################################################

# Leave a Reply  (webpage footer residue; not part of the script)