###############################################################
# Matthew L. Jockers
# mjockers@unl.edu
# Text Analysis and Topic Modeling in the Humanities Workshop
# University of Wisconsin-Milwaukee
# April 19, 2013
###############################################################
###############################################################
# SESSION ONE (9:00-10:15)
###############################################################
#############################################
# 1.1 The R computing environment--what is R
#############################################
#############################################
# 1.2 R console vs. RStudio show both quickly
#############################################
#############################################
# 1.3 Vectors and basic math in R
#############################################
# Explain the assignment operator "<-"
myvar<-1
myvar
length(myvar)
# Notice that myvar is displayed with a [1]. R uses vectors to store most everything. A vector is simply an ordered collection of items; in this case myvar is a vector of one item.
# Now we create a new vector with 4 items using the "c" function
myvar<-c(1,2,4,6) # the "c" function means "combine"
myvar
length(myvar)
class(myvar) # show numeric class
# Vectors can contain different types of data (numeric, character, etc.). . .
myvar<-c('a', 'b', 'c')
class(myvar) # show character class
# . . . but a single vector cannot hold values of different data types at the same time.
myvar<-c("a", "b", 1, 2, 3)
myvar
# Notice that R has converted the numbers to characters.
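# A quick optional check: class() confirms that the whole vector is now character data.
class(myvar)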
# R has lots of handy shortcuts
myvar<-5:25
myvar
# Accessing items in a vector can get confusing. Think of each item as having a numerical index from 1 to n.
# In the sequence we just created, the first item in the vector is the number 5. We can access items in a vector by putting their index number inside square brackets, like this:
myvar[1]
# and if we want to see the first three items
myvar[1:3]
# ? huh ? Yes, this can be a bit confusing, especially when we use numbers as items in the vector.
# consider this instead
myvar<-c("This", "is", "a", "test", ".")
myvar[1:3]
myvar[3:5]
# Cool. Now back to numbers for a minute. With items in vectors, we can do vector math. . .
myvar<-c(1,2,4,6)
myvar+10
# Notice that R "recycles" the 10 over each value in the vector. This is very handy and very important.
# it can get even more fun when you have two vectors, like this
myvar1<-c(1,2,4,6)
myvar2<-c(10, 20,40,60)
# you can add them "index" wise across vectors.
myvar1+myvar2
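# A quick aside on recycling with vectors of different lengths: the shorter vector is repeated,
# so here 100 is added to the 1st and 3rd items and 1000 to the 2nd and 4th (just an illustration).
myvar1+c(100, 1000)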
# you can also use R as a calculator. . .
1+1
2-3
3*3
8675309/3
# that's enough for now . . . let's play with some text
#############################################
# 1.4 Text manipulation in R
#############################################
# Load Plain text of Moby Dick from Project Gutenberg using the scan function
text<-scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n")
class(text) # text is a character vector
# what is that \n thingy? It's the newline character: sep="\n" tells scan to break the file into one element per line.
length(text) #[1] 18874
text[1:3]
#explain line based chunking when using \n
# Explore how the text is ingested as lines.
text[1:407] #Introductory material and boilerplate
text[408] #start of first chapter
text[18577] #beginning of the final boilerplate
class(text)
str(text)
# Demonstrate the use of the which function as a way to find items in vector
start<-which(text == "CHAPTER 1. Loomings.")
start
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
end
start<-408
end<-18577
# instead of this: novel.lines<- text[408:(18577-1)]
# we can use the start and end variables
# Now we can easily isolate the main text of the novel
novel.lines<- text[start:(end-1)] # why the -1? because "end" is the first line of the closing boilerplate, so we stop one line before it
length(novel.lines)
novel.lines[1]
# so now we've got 18169 lines.
# we want a single string of text
# so we can use paste to glue all of the lines together
novel<-paste(novel.lines, collapse=" ")
length(novel)
# huh? What happened here? paste() with collapse=" " glued all 18169 lines into ONE long string, so length() now reports 1.
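# If we want a sense of how big that single string is, nchar() counts characters rather than elements
# (just a quick sanity check; we don't need this value later).
nchar(novel)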
# now we want to begin doing some analysis, let's convert all to lowercase
novel.lower <-tolower(novel)
class(novel.lower)
novel.lower # check it out y'all
moby.words<-strsplit(novel.lower, "\\W") # wassup with that \\W, man? It's a regular expression meaning "any NON-word character", so we split on spaces and punctuation
class(moby.words) # a list, what the heck is a list?
length(moby.words) # well, it is a list of one, and the one happens to be a vector
# we can access the item(s) in a list using double brackets
# Here I'll show the entire vector of words that is in item 1
moby.words[[1]]
# and we can see how many items are in the vector in the first list item, like this
length(moby.words[[1]])
# Well, in this case the list seems extraneous. In fact it is.
# We only have a list here because that is the way that strsplit outputs
# its results. So let's get rid of it.
moby.word.vector<-unlist(moby.words)
length(moby.word.vector) # hey, that's the same as this: length(moby.words[[1]])
# if you look at the vector of words you'll see some odd "blanks"
moby.word.vector[1:20]
# huh. . . when we split on \\W, every punctuation character becomes a split point, and those split points leave empty strings ("") behind as a residue.
# we can figure out the POSITIONS of these blanks using which
which(moby.word.vector=="")
# and we can figure out which are not blanks by using the not equals operator !=
not.blanks<-which(moby.word.vector!="")
# so, not.blanks is now a vector of the "positions" of items in the moby.word.vector that are not blanks.
# so we can overwrite the existing moby.word.vector with a pared down version that omits the blanks, like this
moby.word.vector<- moby.word.vector[not.blanks]
# once we have all the words in a vector, we can do some cool searching:
whales<-which(moby.word.vector=="whale")
whale.hits<-length(whales)
whale.hits/length(moby.word.vector) # relative frequency of whales
# compare to "the"
the<-which(moby.word.vector=="the")
the.hits<-length(the)
the.hits/length(moby.word.vector)
# we can make the percentages easier to read
100*(the.hits/length(moby.word.vector))
100*(whale.hits/length(moby.word.vector))
# we can figure out how many unique words there are in the novel
length(unique(moby.word.vector)) # that's big!
# we can even create an entire frequency list for all the words
moby.freqs<-table(moby.word.vector)
sorted.moby.freqs<-sort(moby.freqs , decreasing=TRUE)
sorted.moby.freqs[1:10]
sorted.moby.rel.freqs<-100*(sorted.moby.freqs/sum(sorted.moby.freqs))
sorted.moby.rel.freqs[1:10]
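# Since table() keeps each word as the name of its entry, we can also look up any single word's
# relative frequency by name (a handy aside; not needed for the plot below).
sorted.moby.rel.freqs["whale"]
sorted.moby.rel.freqs["ahab"]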
plot(sorted.moby.rel.freqs[1:10], type="b", xlab="Top Ten Words In Moby Dick by Rel Freq", ylab="Percentage of Full Text", xaxt="n")
axis(1, 1:10, labels=names(sorted.moby.rel.freqs[1:10]))
###############################################################
# BREAK TIME: (10:15-10:30)
###############################################################
###############################################################
# SESSION TWO (10:30-12:00)
#############################################
# 2.1 Downloading and exploring the exercise corpus
#############################################
# https://www.matthewjockers.net/wp-content/uploads/2013/04/uwm-workshop.zip
# setwd or Session->Set Working Dir->Choose Dir
setwd("~/Documents/Workshops/Wisconsin/uwm-workshop")
# Load a directory of files for analysis
corpusDir<-"data/corpus"
files<-dir(path=corpusDir, pattern=".*txt")
files #explain what we are seeing here
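# An optional quick count of how many files dir() found; this should match the number of .txt files in data/corpus.
length(files)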
# Load and examine the metadata file
metadata<-read.csv("data/metadata.csv", stringsAsFactors=F)
dim(metadata)
class(metadata)
head(metadata)
# what is a data.frame?
# show how a dataframe is accessed
metadata[1:2, 1:4]
# show short cut for column names
metadata$author
metadata[,4]
#show colnames function
colnames(metadata)
# Summary: So we have a vector of file names that we sucked out of a directory, and we have a data.frame,
# loaded from a .csv file (i.e. a spreadsheet-style file), that contains metadata about the files in the directory.
# We now want to create a topic model of all the files in that directory and then say something interesting about the topics in this corpus.
#############################################
# 2.2 What is Latent Dirichlet Allocation (LDA) anyhow?
#############################################
# LDA buffet story.
#############################################
# 2.3 Text Chunking
#############################################
# load a text from the directory--Dorian Gray
novel.lines<-scan(file.path(corpusDir, files[33]), what="character", sep="\n")
novel<-paste(novel.lines, collapse=" ")
novel.lower <-tolower(novel)
wilde.words<-strsplit(novel.lower, "\\W") # tokenize
wilde.words.vector<-unlist(wilde.words)
not.blanks<- which(wilde.words.vector!="") # remove blanks
wilde.words.vector<- wilde.words.vector[not.blanks]
length(wilde.words.vector)
# [1] 80294
# We want to chunk this into n 1000 word chunks.
chunk.size<-1000 # set a chunk variable
num.chunks<-length(wilde.words.vector)/chunk.size
num.chunks
x <- seq_along(wilde.words.vector)
# create a list where each item is a chunk vector
chunks <- split(wilde.words.vector, ceiling(x/chunk.size))
# What have we done here? ceiling is a rounding function.
?ceiling # see the help page for details
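# A tiny illustration of how ceiling turns word positions into chunk ids: with a chunk size of 3,
# positions 1 through 10 map to chunks 1,1,1,2,2,2,3,3,3,4 (the same logic we just used with 1000).
ceiling(1:10/3)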
class(chunks)
#inspect the first 1500 values
ceiling(x/chunk.size)[1:1500]
# what about split
? split
#examine the result
str(chunks)
# so now we need to convert this list of vectors into a list of contiguous text chunks
# so consider that this first chunk
chunks[[1]]
# contains a vector of the first 1000 words. But instead of a vector, we need a single string of contiguous text, like this:
paste(chunks[[1]], collapse=" ")
#############################################
# 2.4 lapply
#############################################
# we can apply this paste function over and over again using lapply, a special function for iterating over list items.
# let's learn lapply.
# lapply is a function that takes a list as an argument (i.e. our chunks list)
# and then returns a new list having applied a function to each item in the list
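# A tiny stand-alone example of lapply before we point it at our chunks:
# apply sum() to each item of a two-item list; the result is a new two-item list.
lapply(list(1:3, 4:6), sum)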
chunks.as.strings<-lapply(chunks, paste, collapse=" ")
# Here we use lapply to apply the "paste" function to each item in the list; collapse=" " is passed along as an additional argument to paste.
# so now this
chunks.as.strings[[1]]
# is the same as this:
paste(chunks[[1]], collapse=" ")
# but we also have
chunks.as.strings[[2]]
# and so on.
# so now we have a list of text chunks as strings of text.
# we want to send these to a topic model but before we can do that we have to get them out of the list format
chunk.vector<-unlist(chunks.as.strings)
length(chunk.vector)
#############################################
# 2.5 A Toy Model
#############################################
# Now we will run a topic model just for fun
# Let's see if we can find 5 topics in Dorian Gray
# I'll explain all of this later, so just play along for now.
library(lda)
doclines<-lexicalize(chunk.vector)
set.seed(8675309)
K <- 5
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE) # the two 0.1 values are the alpha and eta hyperparameters
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)
# let's see what we've got
top.words
# tada, it's garbage! The problem is that we have lots of high frequency words that
# are screwing up the topical-ness of our topic model.
# so we need to get rid of these danged stop words.
#############################################
# 2.6 What is a stoplist
#############################################
# Here is a typical stop list (adjust this path to wherever english.stop.txt lives on your machine)
english.stoplist<-read.csv("/Users/mjockers/Documents/Workshops/Wisconsin/data/english.stop.txt", stringsAsFactors=F)
english.stoplist<-unique(english.stoplist[,1])
# our goal is to scan each chunk and remove words that are in the stoplist.
# remember chunks?
str(chunks)
# it is a list containing a series of word vectors.
# we need to go back to the point just before we glued all of those words in each vector together with the following command
# chunks.as.strings<-lapply(chunks, paste, collapse=" ")
# Before gluing it all together, we need to remove the stopwords from each chunk.
# we can use lapply again here too, but we need a custom function to apply
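# A tiny illustration of the %in% operator, which we'll lean on in the function below:
# it returns TRUE for each item of the left-hand vector that appears in the right-hand vector.
c("the", "whale", "and", "sea") %in% c("the", "and", "of")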
#############################################
# 2.7 A Function
#############################################
# This function takes a vector of words and a stoplist and returns the vector with every stoplist word removed.
# (Using ! with %in%, rather than -which(), keeps it safe even if a chunk happens to contain no stopwords at all.)
mywhich<-function(word.vector, stoplist){
    word.vector[!(word.vector %in% stoplist)]
}
# Now apply the function over each item in the chunks list
chunks.without.stops<-lapply(chunks, mywhich, english.stoplist)
#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")
# and convert the list to a character vector
chunk.vector<-unlist(chunks.as.strings)
length(chunk.vector)
#############################################
# 2.8 A Toy Model again
#############################################
# Now run the LDA again, over the new list
# Let's see if we can find 5 topics in Dorian Gray
# I'll explain all of this later, so just play along for now.
library(lda)
doclines<-lexicalize(chunk.vector)
set.seed(8675309)
K <- 5
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)
# let's see what we've got
top.words
# So, these topics are better, but still not the best we could hope for, due
# to the presence of lots of character names.
###########################################################################
# LUNCH BREAK (12:00-1:30)
###########################################################################
###########################################################################
# SESSION 3 (1:30-2:45)
#############################################
# 3.1 A better stoplist
#############################################
# Let's load a much larger stoplist, one that also includes character names
long.stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
long.stoplist<-unique(long.stoplist[,1])
length(long.stoplist)
#############################################
# 3.2 A slightly better model
#############################################
chunks.without.stops<-lapply(chunks, mywhich, long.stoplist)
#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")
# and convert the list to a character vector
chunk.vector<-unlist(chunks.as.strings)
library(lda)
doclines<-lexicalize(chunk.vector)
set.seed(8675309)
K <- 10
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 25, by.score=TRUE)
# let's see what we've got
top.words
# Well, this is getting better and maybe at the level of a single novel
# we might even want to keep the character names in. But once we move
# from a single text to a corpus, we really don't want character based topics.
# visit
# https://www.matthewjockers.net/2013/04/12/secret-recipe-for-topic-modeling-themes/
# to see how topics built with only an English stopword list end up full of character names,
# and why that is a problem once we care about themes rather than characters.
#############################################
# 3.3 POS tagging
#############################################
novel.lines<-scan(file.path(corpusDir, files[33]), what="character", sep="\n")
text.blob<-paste(novel.lines, collapse=" ")
library(openNLP)
tagged_text <- tagPOS(text.blob) # takes 2 minutes
# we'll need some new functions to handle this tagged output (see 3.4 below); first, split it into individual word/TAG tokens
tagged.words<-unlist(strsplit(tagged_text," "))
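# If you peek at the tagged tokens you'll see that each word now carries a part-of-speech suffix;
# the exact tags depend on openNLP's tagger, but each token should look something like "the/DT" or "picture/NN".
tagged.words[1:10]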
# Now chunk the full list
chunk.size<-1000 # set a chunk variable
num.chunks<-length(tagged.words)/chunk.size
x <- seq_along(tagged.words)
# create a list where each item is a chunk vector
chunks <- split(tagged.words, ceiling(x/chunk.size))
#############################################
# 3.4 new functions
#############################################
# New Function: keep only the word/TAG tokens whose tag matches the regular expression in tagID
SelectTaggedWords <- function(Words,tagID) {
    Words[grep(tagID, Words)]
}
tagged.words.select.list<-lapply(chunks, SelectTaggedWords, "/NN$")
# New Function: strip the trailing "/TAG" (two or three capital letters) from each token
RemoveTags <- function(Words) {
    sub("/[A-Z]{2,3}","",Words)
}
words.select.list<-lapply(tagged.words.select.list, RemoveTags)
# New Function: strip out everything except letters, numbers, spaces, and apostrophes
RemoveNonChar <- function(Words) {
    gsub("[^[:alnum:][:space:]']","",Words)
}
text.word.vector <- lapply(words.select.list, RemoveNonChar)
chunks.without.stops<-lapply(text.word.vector, mywhich, english.stoplist)
#############################################
# 3.5 re-modeling
#############################################
#now we need to collapse the chunks just like in the past
chunks.as.strings<-lapply(chunks.without.stops, paste, collapse=" ")
# and convert the list to a character vector
chunk.vector<-unlist(chunks.as.strings)
library(lda)
doclines<-lexicalize(chunk.vector)
set.seed(8675309)
K <- 20
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 5, by.score=TRUE)
# let's see what we've got
top.words
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
# [1,] "pleasure" "lamp" "gold" "hansom" "murder" "experience" "companion" "bag"
# [2,] "worship" "women" "silver" "horse" "crime" "art" "grass" "train"
# [3,] "one" "street" "silk" "driver" "laboratory" "impulse" "wind" "coat"
# [4,] "coffee" "curtain" "ceiling" "lace" "action" "instinct" "daisy" "servant"
# [5,] "spirit" "ice" "damask" "cell" "method" "nothing" "happiness" "uncle"
# [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
# [1,] "art" "blood" "love" "life" "sir" "music" "accident" "mother"
# [2,] "artist" "cap" "theatre" "man" "picture" "principle" "sort" "son"
# [3,] "painter" "poison" "night" "thing" "screen" "advantage" "town" "gentleman"
# [4,] "picture" "race" "act" "time" "key" "carpet" "fellow" "marriage"
# [5,] "fellow" "collar" "voice" "room" "tea" "host" "omen" "child"
# [,17] [,18] [,19] [,20]
# [1,] "book" "picture" "world" "husband"
# [2,] "type" "canvas" "consolation" "tragedy"
# [3,] "paper" "portrait" "yesterday" "dear"
# [4,] "music" "prayer" "globe" "woman"
# [5,] "background" "mouth" "caprice" "lady"
###############################################################
# BREAK (2:45-3:00)
###############################################################
###############################################################
# SESSION 4 (3:00-4:30)
###############################################################
# Now let's switch to a pre-POS-tagged corpus and make thematic word clouds.
# This requires POS tagging all of the texts. Since that takes several hours,
# I did it in advance. Here is the code required to do it on your own:
# library(openNLP)
# for(i in 13:length(files)){
# text<-scan(file.path(corpusDir, files[i]), what="character", sep="\n")
# text.blob<-paste(text, collapse=" ")
# tagged_text <- tagPOS(text.blob)
# write(tagged_text, paste("data/taggedCorpus/",files[i], sep=""))
# }
###############################################################
#############################################
# 4.1 ingesting the entire corpus
#############################################
inputDir<-"data/taggedCorpus"
files<-dir(path=inputDir, pattern=".*txt")
chunk.size<-1000 #number of words per chunk
long.stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
long.stoplist<-unique(long.stoplist[,1])
# Let's take the functions we wrote above (SelectTaggedWords, RemoveTags, and RemoveNonChar)
# and move them into a new file called pos.functions.r.
# Once they are in that file, we can load them with the "source" function.
source("code/pos.functions.r")
topic.df<-NULL
for(i in 1:length(files)){
    # load and tokenize one tagged file, keeping only the common nouns tagged /NN
    tagged_text<-scan(file.path(inputDir, files[i]), what="character", sep="\n")
    tagged_words <- unlist(strsplit(tagged_text," "))
    tagged_words <- c(SelectTaggedWords(tagged_words,"/NN$")) # SELECT POS TAG TYPES TO KEEP
    tagged_words_less <- RemoveTags(tagged_words)
    word.vector <- RemoveNonChar(tagged_words_less)
    # remove the blanks
    text.word.vector<- word.vector[which(word.vector!="")]
    # remove stopwords
    text.word.vector<-text.word.vector[!(text.word.vector %in% long.stoplist)]
    # split the word vector into 1000-word chunks
    num.chunks<-length(text.word.vector)/chunk.size
    max<-length(text.word.vector)/num.chunks # this works out to chunk.size
    x <- seq_along(text.word.vector)
    chunks <- split(text.word.vector, ceiling(x/max))
    # Here I introduce a way to deal with small chunks at the end:
    # if the final chunk has 500 or fewer words, fold it into the chunk before it
    if(length(chunks[[length(chunks)]]) <= 500){
        chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
        chunks[[length(chunks)]]<-NULL
    }
    # glue each chunk back into a single string and stack the results:
    # column 1 is the file name, column 2 is the chunk text
    chunks<-lapply(chunks, paste, collapse=" ")
    chunks.df<-do.call(rbind, chunks)
    chunks.df<-cbind(files[i],chunks.df)
    topic.df<-rbind(topic.df, chunks.df)
}
#############################################
# 4.2 Modeling the entire corpus
#############################################
library(lda)
doclines<-lexicalize(topic.df[,2])
set.seed(8675309)
K <- 50
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE)
top.words <- top.topic.words(result$topics, 10, by.score=TRUE)
# View top words
top.words
#############################################
# 4.3 Visualizing topics as clouds
#############################################
library(wordcloud)
result.dir<-"topicCloudsFromTagged/"
# make sure the output directory exists before writing the .png files
dir.create(result.dir, showWarnings=FALSE)
for(i in 1:nrow(result$topics)){
    # take the 50 heaviest words in topic i and draw them as a cloud
    cloud.data<-sort(result$topics[i,], decreasing=T)[1:50]
    png(paste(result.dir, i,".png", sep=""), width = 12, height = 12, units="in", res=300)
    print(wordcloud(names(cloud.data), freq=cloud.data, scale=c(4,.10), min.freq=1, rot.per=0, random.order=F))
    dev.off()
}
#############################################
# 4.4 Topic data analysis
#############################################
# after looking at the clouds, we decide we are interested in
# topic 25 which has to do with depictions of slavery
# First let's get the proportional data from the model
topic.proportions <- t(result$document_sums) / colSums(result$document_sums)
# topic.proportions is a matrix with 932 rows: one for each 1000 word text segment
# and 50 columns, one for each topic. To make life easier, let's create a column header row
colnames(topic.proportions)<-paste("Topic",1:50, sep="_")
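# A quick optional sanity check: each row of topic.proportions is one chunk, and because we divided
# by each chunk's word total, every row should sum to (approximately) 1 across the 50 topics.
dim(topic.proportions)
rowSums(topic.proportions)[1:5]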
# But there is no info in the matrix about which chunk came from which file.
# We have that data in the first column of topic.df, though, and we can pull it out like this
thefilenames<-topic.df[, 1]
# now we assign each chunk a sequence number within its file
chunk.ids<-ave(seq_along(thefilenames), thefilenames, FUN = seq_along)
# we can then "bind" all of this "column" data
# to the topic.proportions matrix, like this
final.df<-cbind.data.frame(thefilenames, chunk.ids, topic.proportions)
# show the result like this. . . .
final.df[1:25, 1:5]
# Now the file names can act like "keys" in a relational database, allowing us to look up metadata
# about those files in a metadata table (here, one prepared for the tagged corpus). For example:
metadata<-read.csv("data/metadata.for.tagged.csv", stringsAsFactors=F)
# but first we need to fix the file extensions since we switched to the .txt files
metadata$filename<-gsub(".xml", ".txt", metadata$filename, fixed=TRUE) # fixed=TRUE so the "." is treated literally
# and now we can figure out which files are which using "which"
metadata[which(metadata$filename == "1008.txt"), ]
# so let's say we are interested in the slavery topic, topic 25.
# We can identify the book with the largest use of the theme like this
max.slavery<-final.df[which(final.df$Topic_25 == max(final.df$Topic_25)), "thefilenames"]
# so, which book is it?
metadata[which(metadata$filename == max.slavery),]
# now let's plot the presence of slavery across the entire novel
plot(final.df[which(final.df$thefilenames == max.slavery), "Topic_25"], type="l")
# and finally, let's calculate and plot the mean usage of the theme across all the novels in the corpus.
#############################################
# 4.5 Basic Plotting
#############################################
mean.slavery<-aggregate(final.df$Topic_25, by=list(thefilenames), mean)
#plot it
barplot(mean.slavery$x,names.arg=mean.slavery$Group.1, las=2)
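# Optionally, to see which novels use the theme most heavily on average, we can sort the aggregated means;
# the column names Group.1 and x are what aggregate() produces by default.
mean.slavery[order(mean.slavery$x, decreasing=TRUE)[1:5], ]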
# Now export the image. The plot makes us wonder about text 500.txt, so let's look it up:
metadata[which(metadata$filename == "500.txt"),]
# X filename date author title
# 41 41 500.txt 1867 Child, Lydia Maria Francis A Romance of the Republic. (1867)
# nation
# 41 American