###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 2
###############################################################
# Don't forget to set your working directory. . . .
# Load Moby Dick File
text<-scan("data/plaintext/melville.txt", what="character", sep="\n")
start<-which(text == "CHAPTER 1. Loomings.")
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
novel.lines<- text[start:(end-1)]
novel.chapter.positions<-grep("^CHAPTER \\d", novel.lines)
last.position<-length(novel.lines)
novel.chapter.positions<-c(novel.chapter.positions, last.position)
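# Optional sanity check (a sketch, not in the original session): peek at
# the first few chapter headings to confirm the grep worked
novel.lines[novel.chapter.positions[1:3]]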
# Create two list objects in which to store type/token tables for each chapter of the novel
chapter.list.freqs<-list()
chapter.list.raws<-list()
# run the loop to fill the two list objects
for(i in 1:length(novel.chapter.positions)){
  if(i != length(novel.chapter.positions)){
    chapter.title<-novel.lines[novel.chapter.positions[i]]
    start<-novel.chapter.positions[i]+1
    end<-novel.chapter.positions[i+1]-1
    chapter.lines<-novel.lines[start:end]
    chapter.words<-tolower(paste(chapter.lines, collapse=" "))
    chapter.words.list<-strsplit(chapter.words, "\\W")
    chapter.words.vector<-unlist(chapter.words.list)
    chapter.words.vector<-chapter.words.vector[which(chapter.words.vector!="")]
    chapter.freqs<-table(chapter.words.vector)
    chapter.list.raws[[chapter.title]]<-chapter.freqs
    chapter.freqs.rel<-100*(chapter.freqs/sum(chapter.freqs))
    chapter.list.freqs[[chapter.title]]<-chapter.freqs.rel
  }
}
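# Quick sanity check (a sketch): expect one list entry per chapter heading
length(chapter.list.raws)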
#Access the relative frequency (a percentage) of "whale" in chapter 1
chapter.list.freqs[[1]]["whale"]
# Use lapply with the "[" function to get whale and ahab frequencies for each chapter
whale.list<-lapply(chapter.list.freqs, "[", "whale")
ahab.list<-lapply(chapter.list.freqs, "[", "ahab")
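# An aside (a sketch): the "[" above works because "[" is itself a
# function in R, so lapply applies it with "whale" as the index argument;
# the equivalent anonymous-function form makes that explicit
whale.list.alt<-lapply(chapter.list.freqs, function(x) x["whale"])
identical(whale.list, whale.list.alt) # TRUE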
# convert the list data into matrix data with do.call
whales<-do.call(rbind, whale.list)
ahabs<-do.call(rbind, ahab.list)
#Recast as vectors
v.whales<-as.vector(whales)
v.ahabs<-as.vector(ahabs)
#bind the vectors into a new matrix
whales.ahabs<-cbind(v.whales, v.ahabs)
#Or alternate method that creates better column names
whales.ahabs<-cbind(whale=v.whales, ahab=v.ahabs)
#Get rid of those pesky NAs (chapters where a word never occurs come back as NA)
whales.ahabs[which(is.na(whales.ahabs))]<-0
#Use colnames to make the names even better
colnames(whales.ahabs)<-c("WHALE", "AHAB")
#plot the values in each column side by side
barplot(whales.ahabs, beside=T, col="black")
#Look at the correlation using cor()
cor(whales.ahabs[,1], whales.ahabs[,2])
#recast the data as data frame
cordata<-as.data.frame(whales.ahabs)
#see how we can use the $
cor(cordata$WHALE, cordata$AHAB)
# Use sample() to randomize the order of one column; cor() then shows what the correlation looks like by chance
cor(sample(cordata$WHALE), cordata$AHAB)
# Quick version: print the correlations from 100 random shuffles to the console
for(i in 1:100){
  print(cor(sample(cordata$WHALE), cordata$AHAB))
}
# Better way: store each result so we can summarize the distribution
myvar<-NULL
for(i in 1:1000){
  myvar<-rbind(myvar, cor(sample(cordata$WHALE), cordata$AHAB))
}
mean(myvar)
max(myvar)
min(myvar)
max(abs(myvar))
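# An aside (a sketch, not in the original session): replicate() does the
# same job without growing an object inside the loop, which is slow in R
perm.cors<-replicate(1000, cor(sample(cordata$WHALE), cordata$AHAB))
mean(perm.cors)
max(abs(perm.cors))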
# Or just use cor.test to see the p-value, confidence interval, etc. . .
cor.test(cordata$WHALE, cordata$AHAB)
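# A sketch: cor.test returns an "htest" object, so individual pieces such
# as the p-value can be pulled out directly
cor.test(cordata$WHALE, cordata$AHAB)$p.value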
####################################
# End of first session 10:30AM
####################################
################################################
# Begin 10:45-12:00 Session with fresh workspace
################################################
text<-scan("data/plaintext/melville.txt", what="character", sep="\n")
start<-which(text == "CHAPTER 1. Loomings.")
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
novel.lines<- text[start:(end-1)]
novel.chapter.positions<-grep("^CHAPTER \\d", novel.lines)
last.position<-length(novel.lines)
novel.chapter.positions<-c(novel.chapter.positions, last.position)
chapter.list.freqs<-list()
chapter.list.raws<-list()
for(i in 1:length(novel.chapter.positions)){
  if(i != length(novel.chapter.positions)){
    chapter.title<-novel.lines[novel.chapter.positions[i]]
    start<-novel.chapter.positions[i]+1
    end<-novel.chapter.positions[i+1]-1
    chapter.lines<-novel.lines[start:end]
    chapter.words<-tolower(paste(chapter.lines, collapse=" "))
    chapter.words.list<-strsplit(chapter.words, "\\W")
    chapter.words.vector<-unlist(chapter.words.list)
    chapter.words.vector<-chapter.words.vector[which(chapter.words.vector!="")]
    chapter.freqs<-table(chapter.words.vector)
    chapter.list.raws[[chapter.title]]<-chapter.freqs
    chapter.freqs.rel<-100*(chapter.freqs/sum(chapter.freqs))
    chapter.list.freqs[[chapter.title]]<-chapter.freqs.rel
  }
}
sum(chapter.list.raws[[1]]) # Total Tokens in Chapter 1
length(chapter.list.raws[[1]]) #Total Types in Chapter 1
sum(chapter.list.raws[[1]])/length(chapter.list.raws[[1]]) # tokens per type (the inverse of the usual TTR)
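# Note (a sketch): the conventional type-token ratio is types divided by
# tokens, i.e., the inverse of the line above
length(chapter.list.raws[[1]])/sum(chapter.list.raws[[1]]) # TTR for chapter 1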
mean(chapter.list.raws[[1]]) # mean word frequency in chapter 1 (same value as tokens per type above)
lapply(chapter.list.raws, mean) # mean word frequency for every chapter
#now saving those means into new var
mean.word.use<-do.call(rbind, lapply(chapter.list.raws, mean))
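# An equivalent shortcut (a sketch): sapply() simplifies the result to a
# named numeric vector, so no do.call(rbind, ...) step is needed; the
# name mean.word.use.v is just illustrative
mean.word.use.v<-sapply(chapter.list.raws, mean)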
#Look ma, row names are in them thar vectors
rownames(mean.word.use)
#Look at me!
plot(mean.word.use, type="h")
# Center the values with scale() to see divergence from the "expected" overall mean (by default scale() also divides by the standard deviation, giving z-scores)
scale(mean.word.use)
plot(scale(mean.word.use), type="h")
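# To subtract the mean only, without dividing by the standard deviation,
# turn scaling off (a sketch):
plot(scale(mean.word.use, scale=FALSE), type="h")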
#That pesky order function
order(mean.word.use, decreasing=T)
# order the means
mean.word.use[order(mean.word.use, decreasing=T),]
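# A simpler alternative (a sketch): sort() keeps the chapter titles
# attached as names
sort(mean.word.use[,1], decreasing=TRUE)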
#Grab the chapter lengths (total token counts per chapter)
chapter.lengths<-do.call(rbind, lapply(chapter.list.raws, sum))
# examine the correlation between chapter length and mean word use
# the long way. . .
my.m<-cbind(chapter.lengths, mean.word.use)
cor(my.m)
#short way
cor(cbind(chapter.lengths, mean.word.use))
#or using cor.test
cor.test(chapter.lengths, mean.word.use)
#let's get those pesky singletons. . .
chapter.hapax<-sapply(chapter.list.raws, function(x) sum(x==1))
# and now as a proportion of chapter length
hapax.percentage<-chapter.hapax/chapter.lengths
# look ma, I can plot them too. . .
barplot(hapax.percentage, beside=T, col="grey", names.arg=seq_along(chapter.list.raws))
#Does the number of hapax correlate to chapter length?
cor.test(chapter.lengths, chapter.hapax)
################################################
# Afternoon Session 1, Day 2
# Clear your Workspace
# Clear your Console
# Clear your Mind
# Begin. . .
################################################
# FUNCTIONs and Do It KWIC
inputDir<-"data/plainText"
files<-dir(inputDir, "\\.txt$") # dir function; match only files ending in .txt
################################################
# show.files Function
# A user-defined (i.e., written by us)
# function to print a vector of file names
# in a user-friendly format
################################################
show.files<-function(file.name.vector){
  for(i in 1:length(file.name.vector)){
    cat(i, file.name.vector[i], "\n", sep=" ")
  }
}
# now call the function with the files argument from above
show.files(files)
################################################
# make.file.word.list Function
# takes a vector of file names and a directory path
# and returns a list in which each item is
# an ordered vector of the words from one of
# the files in the file vector
################################################
make.file.word.list<-function(file.vector, inputDir){
  text.word.vector.list<-list()
  for(i in 1:length(file.vector)){
    # read in the file from the input directory
    text.lines<-scan(paste(inputDir, file.vector[i], sep="/"), what="character", sep="\n")
    # convert to a single string
    text<-paste(text.lines, collapse=" ")
    text.lower<-tolower(text)
    text.words.list<-strsplit(text.lower, "\\W")
    text.word.vector<-unlist(text.words.list)
    # remove the blanks left behind by strsplit
    text.word.vector<-text.word.vector[which(text.word.vector!="")]
    # use the file name from file.vector as the "name" of the list item
    text.word.vector.list[[file.vector[i]]]<-text.word.vector
  }
  return(text.word.vector.list)
}
#build a list object using the function
my.corpus<-make.file.word.list(files, inputDir)
my.corpus[[1]][1:100] # look at the first 100 words of the first text
positions<-which(my.corpus[[1]]=="gutenberg") #find gutenberg
first.instance<-positions[1]
cat(my.corpus[[1]][(first.instance-1):(first.instance+1)])
a.dogs<-which(my.corpus[[1]]=="dog") # 1 for austen
m.dogs<-which(my.corpus[[2]]=="dog") # 2 for melville
# a hard coded KWIC list
context<-2
for(i in 1:length(m.dogs)){
  cat(my.corpus[[2]][(m.dogs[i]-context):(m.dogs[i]+context)], "\n")
}
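###############################################################
# A sketch (not from the original session) of how the hard-coded
# loop above might be generalized into a reusable KWIC function;
# the name "show.kwic" and its arguments are illustrative, and
# the version in corpusFunctions.r may well differ
###############################################################
show.kwic<-function(word.vector, keyword, context=2){
  hits<-which(word.vector==keyword)
  for(i in hits){
    left<-max(1, i-context) # don't run off the front of the text
    right<-min(length(word.vector), i+context) # ...or off the end
    cat(word.vector[left:right], "\n")
  }
}
# e.g. show.kwic(my.corpus[[2]], "dog", context=2)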
###############################################################
# Do It KWIC--BETTER
# Clear your workspace
# we are now putting our functions in another
# file called "corpusFunctions.r"
###############################################################
source("code/corpusFunctions.r") # reference a supporting file
inputDir<-"data/plainText"
files<-dir(inputDir, "\\.txt$")
outputDir<-"results/"
my.corpus<-make.file.word.list(files, inputDir) # this function is in another file
###############################################################
# For the rest of Day 2, see corpusFunctions.r: http://www.matthewjockers.net/materials/dhwi-r-code-functions-file/
###############################################################