###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 4
###############################################################
# Today we pick up where we left off yesterday. . .
# Review how mapply() and xtabs() work from the end of yesterday.
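# As a quick refresher, here is a minimal sketch of that mapply()/xtabs() idiom
# on two tiny made-up word tables (toy objects, not part of the corpus):
toy.list<-list(bookA=table(c("the", "the", "cat")), bookB=table(c("the", "dog")))
toy.freqs<-mapply(data.frame, ID=seq_along(toy.list), toy.list, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE))
# each element is now a data frame with columns ID, Var1 (the word) and Freq (its count)
toy.df<-do.call(rbind, toy.freqs)
xtabs(Freq ~ ID+Var1, data=toy.df) # one row per book, one column per word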
###############################################################
# Here is the important code from yesterday again. . .
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
library(XML)
source("code/corpusFunctions.r")
book.list.freqs<-list()
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
worddata<-getWordLists(doc)
book.list.freqs[[files[i]]]<-worddata
}
freqs.list<-mapply(data.frame, ID=seq_along(book.list.freqs), book.list.freqs, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE))
freqs.df<-do.call(rbind, freqs.list)
result<-xtabs(Freq ~ ID+Var1, data=freqs.df)
final.m<-apply(result, 2, as.numeric)
# End of yesterday's code
###############################################################
# now continue with clustering. . .
smaller.m <- final.m[,apply(final.m,2,mean)>=.25] # keep only the words whose mean frequency across books is at least 0.25
dm<-dist(smaller.m) # Creates a distance object
cluster <- hclust(dm) # Performs a cluster analysis on the distance object
cluster$labels<-names(book.list.freqs) # use the book file names as labels.
plot(cluster) # plots the results as a dendrogram for our inspection.
# OR in one line
plot(hclust(dist( final.m[,apply(final.m,2,mean)>=.25] )), labels=names(book.list.freqs))
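# Optional: rather than eyeballing the dendrogram, cutree() (base R) will cut the
# tree into a fixed number of groups; k=4 here is an arbitrary choice for illustration.
groups<-cutree(cluster, k=4)
groups # a named vector mapping each book file to a cluster number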
##############################################################
# Classification
# A Small Authorship Classification Experiment
##############################################################
# Don't forget to clear the workspace and reset the working directory with setwd()
library(XML)
source("code/corpusFunctions.r")
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
book.list.freqs<-list()
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
chunkdata<- getWordSegmentLists(doc,10)
book.list.freqs[[files[i]]]<- chunkdata
}
freqs.list <- lapply(book.list.freqs, my.mapply)
freqs.df <- do.call(rbind,freqs.list)
dim(freqs.df)
head(freqs.df)
# e.g. gsub("\\..*", "", "anonymous.xml.1") strips everything from the first period on, leaving "anonymous"
bookids<-gsub("\\..*", "", rownames(freqs.df))
book.chunk.ids<-paste(bookids, freqs.df$ID, sep="_") # e.g. "anonymous_1" for the first chunk of anonymous.xml
freqs.df$ID<-book.chunk.ids
result <- xtabs(Freq ~ ID+Var1, data=freqs.df)
final<-as.data.frame.matrix(result) #convert to data.frame
# Need to create metadata columns from text-id column
metacols<-do.call(rbind, strsplit(rownames(final), "_"))
colnames(metacols)<-c("sampletext", "samplechunk")
author<-gsub("\\d.*", "", metacols[,"sampletext"]) # drop everything from the first digit on, leaving just the author name
final.df<-cbind(author, metacols, final)
#Winnow the data
freq.means<-colMeans(final.df[,4:ncol(final.df)]) # mean frequency of each word across all chunks
keepers<-which(freq.means >=.005) # keep only words whose mean frequency is at least 0.005
smaller.df<-final.df[, names(keepers)]
smaller.df<-cbind(author, metacols, smaller.df)
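# Optional sanity check: how many words survived the winnowing, and the
# dimensions of the resulting data frame.
length(keepers)
dim(smaller.df)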
# Run SVM classification
library(e1071)
anon<-which(smaller.df$author == "anonymous") # rows belonging to the anonymous text
train <- smaller.df[-anon,4:ncol(smaller.df)] # word frequencies for the known-author chunks
class <- smaller.df[-anon,"author"] # the author of each training chunk
model <- svm(train, class)
pred <- predict(model, train) #same as pred <- fitted(model)
cm<-table(pred, class) # confusion matrix of predicted vs. actual authors
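# Optional check: overall in-sample accuracy, i.e. the correct predictions on the
# diagonal of the confusion matrix divided by the total number of training chunks.
sum(diag(cm))/sum(cm)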
testdata <- smaller.df[anon,4:ncol(smaller.df)] # word frequencies for the anonymous chunks
final.result<-predict(model, testdata)
as.data.frame(final.result) # predicted author for each anonymous chunk
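# Tallying the predictions gives a simple majority vote: the author predicted for
# the most anonymous chunks is the model's best guess for the whole book.
table(final.result)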
###############################################################################
# Topic Modeling
###############################################################################
# Two options are outlined below. Choose one of the two options and then
# skip down to the "RUNNING THE TOPIC MODEL" section where the lda model gets run.
###############################################################################
# OPTION ONE: Using a stoplist
# Returns topics with lots of character names
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
chunk.size<-1000 #number of words per chunk
stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
library(XML)
topic.df<-NULL
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
paras<-getNodeSet(doc, "/TEI/text/body//p")
words<-paste(sapply(paras,xmlValue), collapse=" ")
words.lower <-tolower(words)
words.lower<-gsub("[^[:alnum:][:space:]']", " ", words.lower) # replace all punctuation except apostrophes with a space character
words.list<-strsplit(words.lower, "\\s+") #split on one or more spaces
words.unlist<-unlist(words.list)
word.vector<-words.unlist[!(words.unlist %in% stoplist[,1])] # drop the stopwords; using ! instead of -which() avoids an empty result when no stopwords match
num.chunks<-length(word.vector)/chunk.size
max<-length(word.vector)/num.chunks
x <- seq_along(word.vector)
chunks <- split(word.vector, ceiling(x/max))
#deal with small chunks at the end
if(length(chunks[[length(chunks)]]) <= 500){
chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
chunks[[length(chunks)]]<-NULL
}
chunks<-lapply(chunks, paste, collapse=" ")
chunks.df<-do.call(rbind, chunks)
chunks.df<-cbind(files[i],chunks.df)
topic.df<-rbind(topic.df, chunks.df)
}
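# A minimal sketch of the chunking idiom used inside the loop above, on a toy
# vector of ten "words" split into chunks of (roughly) three:
toy.words<-letters[1:10]
toy.chunk.size<-3
split(toy.words, ceiling(seq_along(toy.words)/toy.chunk.size)) # chunks of 3, 3, 3 and 1
# the loop above then folds any trailing chunk of 500 words or fewer into the chunk before it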
# Now skip to "Running the model" below
###############################################################################
# OPTION TWO: Requires POS tagging.
# With this corpus total tagging time = ~2.5 hours
# Here is the code I used to tag the files
# library(openNLP)
# for(i in 1:length(files)){
# doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
# paras<-getNodeSet(doc, "/TEI/text/body//p")
# words<-paste(sapply(paras,xmlValue), collapse=" ")
# tagged_text <- tagPOS(words)
# write(tagged_text, paste("data/taggedCorpus/",files[i], ".txt", sep=""))
# }
inputDir<-"data/taggedCorpus"
files<-dir(path=inputDir, pattern=".*xml") # the tagged files written above end in .xml.txt, so this pattern still matches them
chunk.size<-1000 #number of words per chunk
source("code/corpusFunctions.r")
topic.df<-NULL
for(i in 1:length(files)){
tagged_text<-scan(file.path(inputDir, files[i]), what="character", sep="\n")
tagged_words <- SplitText(tagged_text)
tagged_words <- c(SelectTaggedWords(tagged_words,"/NN$")) # SELECT POS TAG TYPES TO KEEP
tagged_words_less <- RemoveTags(tagged_words)
word.vector <- RemoveNonChar(tagged_words_less)
num.chunks<-length(word.vector)/chunk.size
max<-length(word.vector)/num.chunks
x <- seq_along(word.vector)
chunks <- split(word.vector, ceiling(x/max))
#deal with small chunks at the end
if(length(chunks[[length(chunks)]]) <= 500){
chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
chunks[[length(chunks)]]<-NULL
}
chunks<-lapply(chunks, paste, collapse=" ")
chunks.df<-do.call(rbind, chunks)
chunks.df<-cbind(files[i],chunks.df)
topic.df<-rbind(topic.df, chunks.df)
}
# Now skip to "Running the model" below
##########################################################################################
# RUNNING THE TOPIC MODEL
##########################################################################################
library(lda)
doclines<-lexicalize(topic.df[,2]) # convert the text chunks into the document/vocab format the lda package expects
set.seed(8675309) # for reproducibility
K <- 40 # number of topics
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE) # the two 0.1 values are the alpha and eta hyperparameters
top.words <- top.topic.words(result$topics, 25, by.score=TRUE) # 25 highest-scoring words per topic
topic.proportions <- t(result$document_sums) / colSums(result$document_sums) # each row is a chunk; its values are that chunk's topic proportions
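# Optional inspection: peek at the top words for the first few topics, and label the
# rows of topic.proportions with their source files so each chunk can be traced back to a book.
top.words[,1:3] # top 25 words for topics 1 through 3
rownames(topic.proportions)<-topic.df[,1]
round(topic.proportions[1:5, 1:5], 3) # first five chunks, first five topics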
# DONE. . . Naw, let's make some purdy clouds. . . .
##########################################################################################
# MAKE SOME WORD CLOUDS OF THE TOPIC MODEL
##########################################################################################
library(wordcloud)
result.dir<-"topicClouds/"
dir.create(result.dir, showWarnings=FALSE) # make sure the output directory exists before writing the PDFs
for(i in 1:nrow(result$topics)){
cloud.data<-sort(result$topics[i,], decreasing=T)[1:50] # the 50 most frequent words in topic i
pdf(paste(result.dir, i,".pdf", sep=""), paper="special")
print(wordcloud(names(cloud.data), freq=cloud.data, scale=c(3,.10), min.freq=1, rot.per=0, random.order=F))
dev.off()
}