# DHWI: R Code Day One

###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R. 
# Day 1
###############################################################

# Don't forget to set your working directory. . . . 

# Build moby.word.vector from the Project Gutenberg text of Moby Dick.
# Read the file with one element per line of text.
text <- scan("data/plaintext/melville.txt", what = "character", sep = "\n")

# Locate the first chapter heading and the Gutenberg end-of-text marker.
start <- which(text == "CHAPTER 1. Loomings.")
end <- which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
# Fail with a clear message if either marker is absent; otherwise the range
# expressions below stop with an obscure "argument of length 0" error.
if (length(start) == 0 || length(end) == 0) {
  stop("Could not find the novel's start/end marker lines in the text.",
       call. = FALSE)
}

# The novel proper: from the first chapter heading up to (not including) the
# end marker.
novel.lines <- text[start:(end - 1)]
# Everything else: the lines before the first chapter heading plus the end
# marker and whatever follows it. seq_len() is used because `1:start - 1`
# parses as `(1:start) - 1`, i.e. 0:(start - 1), which only selected the right
# lines because an index of 0 is silently dropped.
metadata <- c(text[seq_len(start - 1)], text[end:length(text)])

# Collapse the novel into one lowercase string and split it on every non-word
# character. Adjacent delimiters leave empty strings behind, so drop those.
novel <- paste(novel.lines, collapse = " ")
novel.lower <- tolower(novel)
moby.words <- strsplit(novel.lower, "\\W")
moby.word.vector <- unlist(moby.words)
not.blanks <- which(moby.word.vector != "")
moby.word.vector <- moby.word.vector[not.blanks]

# Token positions at which the word "whale" occurs.
whales <- which(moby.word.vector == "whale")
# Basic counts: total number of tokens and number of "whale" tokens
total.words <- length(moby.word.vector)
whale.hits <- length(whales)
100 * (whale.hits / total.words) # "whale" as a percentage of all tokens

# Number of distinct word types in Moby Dick
length(unique(moby.word.vector))

# How many tokens of each type? table() counts the occurrences of every
# distinct word in the vector.
moby.freqs <- table(moby.word.vector)

# Show counts for the first ten types. head() is used instead of [1:10] so
# that a text with fewer than ten types does not produce NA entries.
head(moby.freqs, 10)

# Better when sorted from most to least frequent. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
sorted.moby.freqs <- sort(moby.freqs, decreasing = TRUE)
head(sorted.moby.freqs, 10)

# Convert the counts to relative frequencies (percent of all tokens); the
# single total is recycled across every count in the table.
sorted.moby.rel.freqs <- 100 * (sorted.moby.freqs / sum(sorted.moby.freqs))

# Line plot of the raw counts of the ten most frequent words
plot(sorted.moby.freqs[1:10], type = "l")

# Relative frequencies of the same ten words. The default x axis is turned
# off (xaxt = "n") so that axis() can label the ticks with the words instead.
plot(
  sorted.moby.rel.freqs[1:10],
  type = "b",
  xlab = "Top Ten Words In Moby Dick by Rel Freq",
  ylab = "Percentage of Full Text",
  xaxt = "n"
)
axis(side = 1, at = 1:10, labels = names(sorted.moby.rel.freqs[1:10]))

# Raw counts can be looked up by word type, using the table's names
sorted.moby.freqs["he"]
sorted.moby.freqs["she"]
sorted.moby.freqs["him"]
sorted.moby.freqs["her"]

# Ratios of masculine to feminine pronoun counts
sorted.moby.freqs["him"] / sorted.moby.freqs["her"]
sorted.moby.freqs["he"] / sorted.moby.freqs["she"]

# Plot where "whale" occurs over the course of the novel.
# One x position per word token. seq_along() replaces the original pair of
# assignments (the first was immediately overwritten) and, unlike 1:length(),
# yields an empty sequence for an empty vector.
xaxis <- seq_along(moby.word.vector)
# NA everywhere except at the positions of "whale", so only the hits are
# drawn as vertical bars (type = "h").
yaxis <- rep(NA, length(xaxis))
yaxis[whales] <- 1
plot(
  yaxis,
  xlab = "Novel Time", ylab = "whale",
  type = "h", ylim = c(0, 1), yaxt = "n"
)

# Plot where "ahab" occurs over the course of the novel.
ahabs <- which(moby.word.vector == "ahab")
# One x position per word token. seq_along() replaces the original pair of
# assignments (the first was immediately overwritten) and, unlike 1:length(),
# yields an empty sequence for an empty vector.
xaxis <- seq_along(moby.word.vector)
# NA everywhere except at the positions of "ahab", so only the hits are
# drawn as vertical bars (type = "h").
yaxis <- rep(NA, length(xaxis))
yaxis[ahabs] <- 1
plot(
  yaxis,
  xlab = "Novel Time", ylab = "ahab",
  type = "h", ylim = c(0, 1), yaxt = "n"
)

# Divide Moby Dick into chapters and generate raw count and frequency tables
# for each chapter, storing the data in two new list objects. Phew!

# Indices (within novel.lines) of every line that starts a chapter.
novel.chapter.postions <- grep("^CHAPTER \\d", novel.lines)
novel.lines[novel.chapter.postions] # show the chapter headings that were found
# Append a closing boundary so the last chapter has an end. The chapter loop
# treats "next position - 1" as a chapter's final line, so the boundary must
# sit one past the last line; using length(novel.lines) itself would drop the
# novel's final line from the last chapter.
last.position <- length(novel.lines)
novel.chapter.postions <- c(novel.chapter.postions, last.position + 1L)

# Arrrrrrrrgh, time to learn the for loop. A warm-up that prints one line
# a hundred times:

justforfun <- "All R and no R&R makes Matt a dull humanist."
for (i in seq_len(100)) {
  print(justforfun)
}

# Drum roll . . . the real for loop

# First instantiate two new list objects, both keyed by chapter heading:
# per-chapter relative frequencies (percentages) and per-chapter raw counts.
chapter.list.freqs <- list()
chapter.list.raws <- list()
# The final entry of novel.chapter.postions only marks where the last chapter
# ends, so iterate over every entry except that one.
for (i in seq_len(length(novel.chapter.postions) - 1)) {
  chapter.title <- novel.lines[novel.chapter.postions[i]]
  # The chapter body runs from the line after its heading to the line before
  # the next boundary.
  start <- novel.chapter.postions[i] + 1
  end <- novel.chapter.postions[i + 1] - 1
  # Guard against a heading with no body: start:end would otherwise count
  # backwards and select the heading lines themselves.
  chapter.lines <- if (start <= end) novel.lines[start:end] else character(0)
  # Same tokenization as for the whole novel: lowercase, split on non-word
  # characters, drop the empty strings left between adjacent delimiters.
  chapter.words <- tolower(paste(chapter.lines, collapse = " "))
  chapter.words.list <- strsplit(chapter.words, "\\W")
  chapter.words.vector <- unlist(chapter.words.list)
  chapter.words.vector <- chapter.words.vector[chapter.words.vector != ""]
  # Raw counts per word type, then the same counts as percentages of the
  # chapter's tokens.
  chapter.freqs <- table(chapter.words.vector)
  chapter.list.raws[[chapter.title]] <- chapter.freqs
  chapter.freqs.rel <- 100 * (chapter.freqs / sum(chapter.freqs))
  chapter.list.freqs[[chapter.title]] <- chapter.freqs.rel
}

##########################################
# End of Day 1
# Happy Day two!
##########################################

# Let's see what this baby we made yesterday is made of: print the structure
# of the per-chapter raw-count list.
str(chapter.list.raws)

# Leave a Reply