Workshop Code

###############################################################
# Matthew L. Jockers
# mjockers@unl.edu
# Text Analysis and Topic Modeling in the Humanities Workshop
# University of Wisconsin-Milwaukee
# April 19, 2013
###############################################################


###############################################################
# SESSION ONE (9:00-10:15)
###############################################################

#############################################
# 1.1 The R computing environment--what is R
#############################################

#############################################
# 1.2 R console vs. RStudio show both quickly
#############################################

#############################################
# 1.3 Vectors and basic math in R
#############################################

# Explain the assignment operator "<-"
myvar<-1
myvar
length(myvar)
# Notice that myvar is displayed with a [1].  R stores nearly everything in vectors.  A vector is simply an ordered collection of items; in this case myvar is a vector containing one item.

# now we create a new vector with 4 items using the "c" function
myvar<-c(1,2,4,6) # the "c" function means "combine"
myvar
length(myvar)
class(myvar) # show numeric class

# vectors can hold other types of data, such as character strings
myvar<-c('a', 'b', 'c')
class(myvar) # show character class

# But a single vector cannot hold values of different data types
myvar<-c("a", "b", 1, 2, 3)
myvar
# Notice that R has converted the numbers to characters.

# R has lots of handy shortcuts
myvar<-5:25
myvar

# Accessing items in a vector can get confusing.  think about each item as having a numerical index from 1 to n
# in the sequence we just created, the first item in the vector list is the number 5.  We can access items in a vector by putting their index number inside square brackets, like this
myvar[1]
# and if we want to see the first three items
myvar[1:3]
# ? huh ?  Yes, this can be a bit confusing, especially when we use numbers as items in the vector.

# consider this instead

myvar<-c("This", "is", "a", "test", ".")
myvar[1:3]
myvar[3:5]

# Cool.  Now back to numbers for a minute.  With items in vectors, we can do vector math. . . 

myvar<-c(1,2,4,6)
myvar+10
#notice that R "recycles" the 10 over each value in the vector.  this is very handy and very important.  

# it can get even more fun when you have two vectors, like this
myvar1<-c(1,2,4,6)
myvar2<-c(10, 20,40,60)

# you can add them "index" wise across vectors.
myvar1+myvar2
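# recycling also kicks in when the two vectors have different lengths:
# the shorter vector is reused until the longer one runs out (a quick illustration)
c(1, 2, 4, 6) + c(10, 100) # returns 11 102 14 106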

# you can also use R as a calculator. . . 

1+1

2-3

3*3

8675309/3

# that's enough for now . . . let's play with some text

#############################################
# 1.4 Text manipulation in R
#############################################

# Load Plain text of Moby Dick from Project Gutenberg using the scan function

text<-scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n")
class(text) # text is a character vector

# what is that \n thingy?
length(text) #[1] 18874

text[1:3]
# because we told scan to split on \n, each element of the text vector is one line of the file
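# a quick aside, just for illustration: "\n" is the newline (end-of-line) character,
# and cat() will render it as an actual line break:
cat("Call me Ishmael.\nSome years ago--never mind how long precisely--\n")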

# Explore how the text is ingested as lines.
text[1:407] #Introductory material and boilerplate
text[408] #start of first chapter
text[18577] #beginning of the final boilerplate
class(text)
str(text)

# Demonstrate the use of the which function as a way to find items in a vector
start<-which(text == "CHAPTER 1. Loomings.")
start
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
end
start<-408
end<-18577

# instead of this: novel.lines<- text[408:(18577-1)]
# we can use the start and end variables
# Now we can easily isolate the main text of the novel
novel.lines<- text[start:(end-1)] # why the -1?  because the line at "end" is the start of the closing boilerplate, which we don't want to keep
length(novel.lines)
novel.lines[1]

# so now we've got 18169 lines.  
# we want a single string of text
# so we can use paste to glue all of the lines together

novel<-paste(novel.lines, collapse=" ")
length(novel)
# huh?  What happened here. . . .
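# length counts the number of items in a vector, and novel is now a single
# (very long) string, so its length is 1.  to count the characters instead, try:
nchar(novel)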

# now we want to begin doing some analysis, let's convert all to lowercase

novel.lower <-tolower(novel)
class(novel.lower)
novel.lower # check it out y'all

moby.words<-strsplit(novel.lower, "\\W") # huh?  wassup with that \\W, man?  (\\W is a regular expression that matches any single non-word character)
class(moby.words) # a list, what the heck is a list?
length(moby.words) # well, it is a list of one, and the one happens to be a vector
# we can access the item(s) in a list using double brackets
# Here I'll show the entire vector of words that is in item 1
moby.words[[1]] 

# and we can see how many items are in the vector in the first list item, like this
length(moby.words[[1]])

# Well, in this case the list seems extraneous.  In fact it is.
# We only have a list here because that is the way that strsplit outputs
# its results.  So let's get rid of it.

moby.word.vector<-unlist(moby.words)
length(moby.word.vector) # hey, that's the same as this: length(moby.words[[1]])

# if you look at the vector of words you'll see some odd "blanks"
moby.word.vector[1:20]
# ? huh. . . when we split on \\W, the punctuation gets stripped out, and these empty strings are left behind as a residue marking where the punctuation characters were.
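# a small illustration (the sentence here is just an example):
strsplit("call me ishmael.  some years ago", "\\W")
# every non-word character becomes a split point, so the period and the
# extra space leave empty strings ("") behind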

# we can figure out the POSITIONS of these blanks using which
which(moby.word.vector=="")

# and we can figure out which are not blanks by using the not equals operator !=
not.blanks<-which(moby.word.vector!="")
# so, not.blanks is now a vector of the "positions" of items in the moby.word.vector that are not blanks. 

# so we can overwrite the existing moby.word.vector with a pared down version that omits the blanks, like this
moby.word.vector<- moby.word.vector[not.blanks]

# once we have all the words in a vector, we can do some cool searching:
whales<-which(moby.word.vector=="whale")
whale.hits<-length(whales)
whale.hits/length(moby.word.vector) # relative frequency of whales
# compare to "the"
the<-which(moby.word.vector=="the")
the.hits<-length(the)
the.hits/length(moby.word.vector)

# we can make the percentages easier to read
100*(the.hits/length(moby.word.vector))
100*(whale.hits/length(moby.word.vector))

# we can figure out how many unique words there are in the novel
length(unique(moby.word.vector)) # that's big!

# we can even create an entire frequency list for all the words
moby.freqs<-table(moby.word.vector)
sorted.moby.freqs<-sort(moby.freqs , decreasing=TRUE)
sorted.moby.freqs[1:10]
sorted.moby.rel.freqs<-100*(sorted.moby.freqs/sum(sorted.moby.freqs))
sorted.moby.rel.freqs[1:10]
plot(sorted.moby.rel.freqs[1:10], type="b", xlab="Top Ten Words In Moby Dick by Rel Freq", ylab="Percentage of Full Text", xaxt="n")
axis(1, 1:10, labels=names(sorted.moby.rel.freqs[1:10]))

###############################################################
# BREAK TIME: (10:15-10:30)
###############################################################


###############################################################
# SESSION TWO (10:30-12:00)
###############################################################

#############################################
# 2.1 Downloading and exploring the exercise corpus
#############################################

# https://www.matthewjockers.net/wp-content/uploads/2013/04/uwm-workshop.zip

# setwd or Session->Set Working Dir->Choose Dir
setwd("~/Documents/Workshops/Wisconsin/uwm-workshop")

# Load a directory of files for analysis
corpusDir<-"data/corpus"
files<-dir(path=corpusDir, pattern=".*txt")
files # a character vector of the file names found in the corpus directory

# Load and examine the metadata file
metadata<-read.csv("data/metadata.csv", stringsAsFactors=F)
dim(metadata)
class(metadata)
head(metadata)
# a data.frame is a table in which each column is a vector (and different columns can hold different data types)
# a data.frame is accessed with [row, column] indexing, like this:
metadata[1:2, 1:4]
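# as an aside, here is a tiny hand-built data.frame, just to see the idea in miniature
# (toy.df and its values are made up for illustration)
toy.df<-data.frame(author=c("Melville", "Wilde"), year=c(1851, 1891), stringsAsFactors=F)
toy.df
toy.df[2, "author"] # returns "Wilde"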

# show short cut for column names
metadata$author
metadata[,4]

#show colnames function
colnames(metadata)

# Summary:  So we have a vector of file names that we sucked out of a directory,
# and we have a data.frame, loaded from a .csv file (i.e. a spreadsheet-style file),
# that contains metadata about the files in the directory.  We now want to create a
# topic model of all the files in that directory, and then we want to be able to say
# something interesting about the topics in this corpus.

#############################################
# 2.2 What is Latent Dirichlet Allocation (LDA) anyhow?
#############################################
# LDA buffet story.

#############################################
# 2.3 Text Chunking
#############################################

# load a text from the directory--Dorian Gray
novel.lines<-scan(file.path(corpusDir, files[33]), what="character", sep="\n")
novel<-paste(novel.lines, collapse=" ")
novel.lower <-tolower(novel)
wilde.words<-strsplit(novel.lower, "\\W") # tokenize
wilde.words.vector<-unlist(wilde.words)
not.blanks<- which(wilde.words.vector!="") # remove blanks
wilde.words.vector<- wilde.words.vector[not.blanks]
length(wilde.words.vector)
# [1] 80294
# We want to chunk this into chunks of 1000 words each.
chunk.size<-1000 # set a chunk variable
num.chunks<-length(wilde.words.vector)/chunk.size
num.chunks
x <- seq_along(wilde.words.vector)
# create a list where each item is a chunk vector
chunks <- split(wilde.words.vector, ceiling(x/chunk.size))
# What have we done here? ceiling is a rounding function.
?ceiling # see the help page
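# ceiling always rounds up to the nearest whole number, for example:
ceiling(c(0.2, 1, 1.001, 2.7)) # returns 1 1 2 3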
class(chunks)

#inspect the first 1500 values
ceiling(x/chunk.size)[1:1500]

# what about split
? split

#examine the result
str(chunks)

# so now we need to convert this list of vectors into a list of contiguous text chunks
# so consider that this first chunk
chunks[[1]]
# contains a vector of words from the first 1000-word chunk.  But instead of a vector, we need a single string of contiguous words, like this:
paste(chunks[[1]], collapse=" ")

#############################################
# 2.4 lapply
#############################################

# we can apply this paste function over and over again using lapply, a special function for iterating over list items.
# let's learn lapply.
# lapply is a function that takes a list as an argument (i.e. our chunks list)
# and then returns a new list having applied a function to each item in the list
chunks.as.strings<-lapply(chunks, paste, collapse=" ")
# here we use lapply to apply the "paste" function to each item in the list, passing collapse=" " along as an additional argument to paste
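# a tiny illustration of the same pattern with a made-up list:
lapply(list(a=1:3, b=4:6), sum) # returns a list: a = 6, b = 15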

# so now this
chunks.as.strings[[1]]
# is the same as this:
paste(chunks[[1]], collapse=" ")
# but we also have 
chunks.as.strings[[2]]
# and so on.

# so now we have a list of text chunks as strings of text.  
# we want to send these to a topic model but before we can do that we have to get them out of the list format

chunk.vector<-unlist(chunks.as.strings)
length(chunk.vector)

#############################################
# 2.5 A Toy Model
#############################################

# Now we will run a topic model just for fun
# Let's see if we can find 5 topics in Dorian Gray
# I'll explain all of this later, so just play along for now.
library(lda)
doclines<-lexicalize(chunk.vector) 
set.seed(8675309)
K <- 5 # the number of topics to find
num.iterations<-250 # the number of sampling iterations
# the two 0.1 values are the alpha and eta hyperparameters of the model
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)

# let's see what we've got
top.words
# tada, it's garbage!  The problem is that we have lots of high-frequency words that
# are screwing up the topical-ness of our topic model.
# so we need to get rid of these danged stop words.

#############################################
# 2.6 What is a stoplist
#############################################

# Here is a typical stop list
# (this is an absolute path on my machine; adjust it to wherever english.stop.txt lives on yours)
english.stoplist<-read.csv("/Users/mjockers/Documents/Workshops/Wisconsin/data/english.stop.txt", stringsAsFactors=F)
english.stoplist<-unique(english.stoplist[,1])
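# have a peek at the first few entries and the total count
english.stoplist[1:10]
length(english.stoplist)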

# our goal is to scan each chunk and remove words that are in the stoplist.
# remember chunks? 
str(chunks)
# it is  a list containing a series of word vectors.  
# we need to go back to the point just before we glued all of those words in each vector together with the following command
# chunks.as.strings<-lapply(chunks, paste, collapse=" ")
# before gluing it all together, we need to remove the stopwords from each chunk
# we can use lapply again here too, but we need a custom function to apply


#############################################
# 2.7 A Function
#############################################

# This function takes a vector of words and a stoplist, finds the positions of
# the words that appear in the stoplist (using the %in% operator),
# and then drops them with negative indexing.
mywhich<-function(word.vector, stoplist){
  word.vector[-which(word.vector %in% stoplist)]
}
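# a quick test on a made-up vector, just to see the function in action:
mywhich(c("the", "whale", "and", "the", "sea"), c("the", "and", "of"))
# returns "whale" "sea"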
# Now apply the function over each item in the chunks list
chunks.without.stops<-lapply(chunks, mywhich, english.stoplist)

#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")

# and convert from a list to a text vector
chunk.vector<-unlist(chunks.as.strings)
length(chunk.vector)


#############################################
# 2.8 A Toy Model again
#############################################
# Now run the LDA again, over the new list
# Let's see if we can find 5 topics in Dorian Gray
# I'll explain all of this later, so just play along for now.
library(lda)
doclines<-lexicalize(chunk.vector) 
set.seed(8675309)
K <- 5
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)

# let's see what we've got
top.words
# So, these topics are better, but still not the best we could hope for, due
# to the presence of lots of character names.

###########################################################################
# LUNCH BREAK (12:00-1:30)
###########################################################################

###########################################################################
# SESSION 3 (1:30-2:45)
###############################################################

#############################################
# 3.1 A better stoplist	
#############################################

# let's load a much larger stoplist that also includes character names
long.stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
long.stoplist<-unique(long.stoplist[,1])
length(long.stoplist)

#############################################
# 3.2 A slightly better model
#############################################

chunks.without.stops<-lapply(chunks, mywhich, long.stoplist)

#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")

# and convert from a list to a text vector
chunk.vector<-unlist(chunks.as.strings)

library(lda)
doclines<-lexicalize(chunk.vector) 
set.seed(8675309)
K <- 10
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)

# let's see what we've got
top.words

# Well, this is getting better, and maybe at the level of a single novel
# we might even want to keep the character names in.  But once we move 
# from a single text to a corpus, we really don't want character-based topics.  
# visit 
# https://www.matthewjockers.net/2013/04/12/secret-recipe-for-topic-modeling-themes/ 
# to see how topics built with just an English stopword list end up full of character names.
# That is a problem because a topic dominated by one book's character names is really
# just a fingerprint of that book rather than a theme shared across the corpus.

#############################################
# 3.3 POS tagging
#############################################

novel.lines<-scan(file.path(corpusDir, files[33]), what="character", sep="\n")
text.blob<-paste(novel.lines, collapse=" ")
library(openNLP)
tagged_text <- tagPOS(text.blob)  # takes 2 minutes
# tagPOS returns the text with each word joined to its part-of-speech tag (e.g. whale/NN),
# so we need some new functions to pull out just the words we want
tagged.words<-unlist(strsplit(tagged_text," "))
# Now chunk the full list
chunk.size<-1000 # set a chunk variable
num.chunks<-length(tagged.words)/chunk.size
x <- seq_along(tagged.words)
# create a list where each item is a chunk vector
chunks <- split(tagged.words, ceiling(x/chunk.size))

#############################################
# 3.4 new functions
#############################################
# New Function: keep only the words whose POS tag matches tagID
# (e.g. "/NN$" keeps the singular common nouns)
SelectTaggedWords <- function(Words,tagID) {
  Words[ grep(tagID,Words) ]
}

tagged.words.select.list<-lapply(chunks, SelectTaggedWords, "/NN$")

# New Function: strip the "/TAG" suffix (two or three capital letters) from each word
RemoveTags <- function(Words) {
  sub("/[A-Z]{2,3}","",Words)
}

words.select.list<-lapply(tagged.words.select.list, RemoveTags)

# New Function: remove any character that is not alphanumeric, whitespace, or an apostrophe
RemoveNonChar <- function(Words) {
  gsub("[^[:alnum:][:space:]']","",Words)
}

text.word.vector <- lapply(words.select.list, RemoveNonChar)
chunks.without.stops<-lapply(text.word.vector, mywhich, english.stoplist)

#############################################
# 3.5 re-modeling
#############################################

#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")

# and convert from a list to a text vector
chunk.vector<-unlist(chunks.as.strings)

library(lda)
doclines<-lexicalize(chunk.vector) 
set.seed(8675309)
K <- 20
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 5, by.score=TRUE)

# let's see what we've got
top.words
# [,1]       [,2]      [,3]      [,4]     [,5]         [,6]         [,7]        [,8]     
# [1,] "pleasure" "lamp"    "gold"    "hansom" "murder"     "experience" "companion" "bag"    
# [2,] "worship"  "women"   "silver"  "horse"  "crime"      "art"        "grass"     "train"  
# [3,] "one"      "street"  "silk"    "driver" "laboratory" "impulse"    "wind"      "coat"   
# [4,] "coffee"   "curtain" "ceiling" "lace"   "action"     "instinct"   "daisy"     "servant"
# [5,] "spirit"   "ice"     "damask"  "cell"   "method"     "nothing"    "happiness" "uncle"  
# [,9]      [,10]    [,11]     [,12]   [,13]     [,14]       [,15]      [,16]      
# [1,] "art"     "blood"  "love"    "life"  "sir"     "music"     "accident" "mother"   
# [2,] "artist"  "cap"    "theatre" "man"   "picture" "principle" "sort"     "son"      
# [3,] "painter" "poison" "night"   "thing" "screen"  "advantage" "town"     "gentleman"
# [4,] "picture" "race"   "act"     "time"  "key"     "carpet"    "fellow"   "marriage" 
# [5,] "fellow"  "collar" "voice"   "room"  "tea"     "host"      "omen"     "child"    
# [,17]        [,18]      [,19]         [,20]    
# [1,] "book"       "picture"  "world"       "husband"
# [2,] "type"       "canvas"   "consolation" "tragedy"
# [3,] "paper"      "portrait" "yesterday"   "dear"   
# [4,] "music"      "prayer"   "globe"       "woman"  
# [5,] "background" "mouth"    "caprice"     "lady" 

###############################################################
# BREAK (2:45-3:00)
###############################################################

###############################################################
# SESSION 4 (3:00-4:30)
###############################################################

# now let's switch to a pre-POS-tagged corpus and make thematic clouds
# This requires POS tagging all of the texts.  Since this takes several hours,
# I did it in advance.  Here is the code required to do it on your own:

# library(openNLP)
# for(i in 13:length(files)){
  # text<-scan(file.path(corpusDir, files[i]), what="character", sep="\n")
  # text.blob<-paste(text, collapse=" ")
  # tagged_text <- tagPOS(text.blob) 
  # write(tagged_text, paste("data/taggedCorpus/",files[i], sep=""))
# }
###############################################################

#############################################
# 4.1 ingesting the entire corpus
#############################################
inputDir<-"data/taggedCorpus"
files<-dir(path=inputDir, pattern=".*txt")
chunk.size<-1000 #number of words per chunk
long.stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
long.stoplist<-unique(long.stoplist[,1])

# Let's take all of the functions we wrote above and move them into a 
# new file called pos.functions.r
# once they are in this other file, we can call them up using the "source" function.

source("code/pos.functions.r")
topic.df<-NULL
for(i in 1:length(files)){
  tagged_text<-scan(file.path(inputDir, files[i]), what="character", sep="\n")
  tagged_words <- unlist(strsplit(tagged_text," ")) 
  tagged_words <- c(SelectTaggedWords(tagged_words,"/NN$")) # SELECT POS TAG TYPES TO KEEP 
  tagged_words_less <- RemoveTags(tagged_words)
  word.vector <- RemoveNonChar(tagged_words_less)
  #remove the blanks
  text.word.vector<- word.vector[which(word.vector!="")]
  #remove stopwords
  text.word.vector<-text.word.vector[-which(text.word.vector %in% long.stoplist)]
  num.chunks<-length(text.word.vector)/chunk.size
  max<-length(text.word.vector)/num.chunks
  x <- seq_along(text.word.vector)
  chunks <- split(text.word.vector, ceiling(x/max))
  # Here I introduce a way to deal with small chunks at the end:
  # if the last chunk is half the chunk size or smaller, fold it into the previous chunk
  if(length(chunks[[length(chunks)]]) <= 500){
    chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
    chunks[[length(chunks)]]<-NULL
  }
  chunks<-lapply(chunks, paste, collapse=" ")
  chunks.df<-do.call(rbind, chunks)
  chunks.df<-cbind(files[i],chunks.df)
  topic.df<-rbind(topic.df, chunks.df)
}

#############################################
# 4.2 Modeling the entire corpus
#############################################
library(lda)
doclines<-lexicalize(topic.df[,2]) 
set.seed(8675309)
K <- 50
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 10, by.score=TRUE)

# View top words
top.words

#############################################
# 4.3 Visualizing topics as clouds
#############################################

library(wordcloud)
result.dir<-"topicCloudsFromTagged/"
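# a small addition (assuming the output folder does not already exist):
# create it so that png() has somewhere to write
dir.create(result.dir, showWarnings=FALSE)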
for(i in 1:nrow(result$topics)){
  cloud.data<-sort(result$topics[i,], decreasing=T)[1:50]
  png(paste(result.dir, i,".png", sep=""), width = 12, height = 12, units="in", res=300)
  print(wordcloud(names(cloud.data), freq=cloud.data, scale=c(4,.10), min.freq=1, rot.per=0, random.order=F))
  dev.off()
}

#############################################
# 4.4 Topic data analysis
#############################################

# after looking at the clouds, we decide we are interested in 
# topic 25 which has to do with depictions of slavery


# First let's get the proportional data from the model
topic.proportions <- t(result$document_sums) / colSums(result$document_sums)

# topic.proportions is a matrix with 932 rows: one for each 1000 word text segment
# and 50 columns, one for each topic.  To make life easier, let's create a column header row
colnames(topic.proportions)<-paste("Topic",1:50, sep="_")
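# quick sanity check: each row (one text chunk) should sum to 1
rowSums(topic.proportions)[1:5]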


# But there is no info in the matrix about which chunk came from which file.
# We do have that data in the first column of topic.df, though, and we can pull it out like this
thefilenames<-topic.df[, 1]

# so now we need to assign each chunk a sequence number within its file
chunk.ids<-ave(seq_along(thefilenames), thefilenames, FUN = seq_along)

# we can then "bind" all of this "column" data 
# to the topic.proportions matrix, like this
final.df<-cbind.data.frame(thefilenames, chunk.ids, topic.proportions)

# show the result like this. . . .
final.df[1:25, 1:5]

# now the file names can act like "keys" in a relational database and allow us to look up
# metadata about those files.  For the tagged corpus we load its own metadata file:
metadata<-read.csv("data/metadata.for.tagged.csv", stringsAsFactors=F)

# but first we need to fix the file extensions since we switched to the .txt files
metadata$filename<-gsub(".xml", ".txt", metadata$filename)

# and now we can figure out which files are which using "which"
metadata[which(metadata$filename == "1008.txt"), ]

# so let's say we are interested in the slavery topic, topic 25.  

# We can identify the book with the largest use of the theme like this
max.slavery<-final.df[which(final.df$Topic_25 == max(final.df$Topic_25)), "thefilenames"]

# so, which book is it?
metadata[which(metadata$filename == max.slavery),]

# now let's plot the presence of slavery across the entire novel
plot(final.df[which(final.df$thefilenames == max.slavery), "Topic_25"], type="l")

# and finally, let's calculate and plot the mean usage of the theme across all the novels in the corpus.
#############################################
# 4.5 Basic Plotting
#############################################
mean.slavery<-aggregate(final.df$Topic_25, by=list(thefilenames), mean)

#plot it
barplot(mean.slavery$x,names.arg=mean.slavery$Group.1, las=2)
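# to save the plot to a file we can wrap it in png()/dev.off(), just as we did
# for the word clouds (the file name here is only an example)
png("mean_slavery_barplot.png", width=12, height=6, units="in", res=300)
barplot(mean.slavery$x, names.arg=mean.slavery$Group.1, las=2)
dev.off()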

# now export the image if you like.  The bar plot makes us wonder about text 500.txt
metadata[which(metadata$filename == "500.txt"),]

#     X filename date author                      title                              nation
# 41 41  500.txt 1867 Child, Lydia Maria Francis  A Romance of the Republic. (1867)  American
