###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 4
###############################################################
# Today we pick up where we left off yesterday. . .
# Review how mapply() and xtabs() work from the end of yesterday.
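# As a quick refresher, here is a minimal sketch of that mapply()/xtabs() idiom
# on two tiny made-up word tables (toy objects, not part of the corpus):
toy.list<-list(bookA=table(c("the", "the", "cat")), bookB=table(c("the", "dog")))
toy.freqs<-mapply(data.frame, ID=seq_along(toy.list), toy.list, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE))
# each element is now a data frame with columns ID, Var1 (the word) and Freq (its count)
toy.df<-do.call(rbind, toy.freqs)
xtabs(Freq ~ ID+Var1, data=toy.df) # one row per book, one column per word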
###############################################################
# Here is the important code from yesterday again. . .
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
library(XML)
source("code/corpusFunctions.r")
book.list.freqs<-list()
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
worddata<-getWordLists(doc)
book.list.freqs[[files[i]]]<-worddata
}
freqs.list<-mapply(data.frame, ID=seq_along(book.list.freqs), book.list.freqs, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE))
freqs.df<-do.call(rbind, freqs.list)
result<-xtabs(Freq ~ ID+Var1, data=freqs.df)
final.m<-apply(result, 2, as.numeric)
# End of yesterday's code
###############################################################
# now continue with clustering. . .
smaller.m <- final.m[,apply(final.m,2,mean)>=.25] # keep only the words whose mean frequency across books is at least 0.25
dm<-dist(smaller.m) # Creates a distance object
cluster <- hclust(dm) # Performs a cluster analysis on the distance object
cluster$labels<-names(book.list.freqs) # use the book file names as labels.
plot(cluster) # plots the results as a dendrogram for our inspection.
# OR in one line
plot(hclust(dist( final.m[,apply(final.m,2,mean)>=.25] )), labels=names(book.list.freqs))
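# Optional: rather than eyeballing the dendrogram, cutree() (base R) will cut the
# tree into a fixed number of groups; k=4 here is an arbitrary choice for illustration.
groups<-cutree(cluster, k=4)
groups # a named vector mapping each book file to a cluster number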
##############################################################
# Classification
# A Small Authorship Classification Experiment
##############################################################
# Don't forget to clear the workspace and reset the working directory with setwd()
library(XML)
source("code/corpusFunctions.r")
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
book.list.freqs<-list()
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
chunkdata<- getWordSegmentLists(doc,10)
book.list.freqs[[files[i]]]<- chunkdata
}
freqs.list <- lapply(book.list.freqs, my.mapply)
freqs.df <- do.call(rbind,freqs.list)
dim(freqs.df)
head(freqs.df)
# e.g. gsub("\\..*", "", "anonymous.xml.1") strips everything from the first period on, leaving "anonymous"
bookids<-gsub("\\..*", "", rownames(freqs.df))
book.chunk.ids<-paste(bookids, freqs.df$ID, sep="_") # e.g. "anonymous_1" for the first chunk of anonymous.xml
freqs.df$ID<-book.chunk.ids
result <- xtabs(Freq ~ ID+Var1, data=freqs.df)
final<-as.data.frame.matrix(result) #convert to data.frame
# Need to create metadata columns from text-id column
metacols<-do.call(rbind, strsplit(rownames(final), "_"))
colnames(metacols)<-c("sampletext", "samplechunk")
author<-gsub("\\d.*", "", metacols[,"sampletext"]) # drop everything from the first digit on, leaving just the author name
final.df<-cbind(author, metacols, final)
#Winnow the data
freq.means<-colMeans(final.df[,4:ncol(final.df)]) # mean frequency of each word across all chunks
keepers<-which(freq.means >=.005) # keep only words whose mean frequency is at least 0.005
smaller.df<-final.df[, names(keepers)]
smaller.df<-cbind(author, metacols, smaller.df)
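# Optional sanity check: how many words survived the winnowing, and the
# dimensions of the resulting data frame.
length(keepers)
dim(smaller.df)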
# Run SVM classification
library(e1071)
anon<-which(smaller.df$author == "anonymous") # rows belonging to the anonymous text
train <- smaller.df[-anon,4:ncol(smaller.df)] # word frequencies for the known-author chunks
class <- smaller.df[-anon,"author"] # the author of each training chunk
model <- svm(train, class)
pred <- predict(model, train) #same as pred <- fitted(model)
cm<-table(pred, class) # confusion matrix of predicted vs. actual authors
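# Optional check: overall in-sample accuracy, i.e. the correct predictions on the
# diagonal of the confusion matrix divided by the total number of training chunks.
sum(diag(cm))/sum(cm)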
testdata <- smaller.df[anon,4:ncol(smaller.df)] # word frequencies for the anonymous chunks
final.result<-predict(model, testdata)
as.data.frame(final.result) # predicted author for each anonymous chunk
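# Tallying the predictions gives a simple majority vote: the author predicted for
# the most anonymous chunks is the model's best guess for the whole book.
table(final.result)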
###############################################################################
# Topic Modeling
###############################################################################
# Two options are outlined below. Choose one of the two options and then
# skip down to the "RUNNING THE TOPIC MODEL" section where the lda model gets run.
###############################################################################
# OPTION ONE: Using a stoplist
# Returns topics with lots of character names
inputDir<-"data/XMLAuthorCorpus"
files<-dir(path=inputDir, pattern=".*xml")
chunk.size<-1000 #number of words per chunk
stoplist<-read.csv("data/stoplist.csv", stringsAsFactors=F)
library(XML)
topic.df<-NULL
for(i in 1:length(files)){
doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
paras<-getNodeSet(doc, "/TEI/text/body//p")
words<-paste(sapply(paras,xmlValue), collapse=" ")
words.lower <-tolower(words)
words.lower<-gsub("[^[:alnum:][:space:]']", " ", words.lower) # replace all punctuation except apostrophes with a space character
words.list<-strsplit(words.lower, "\\s+") #split on one or more spaces
words.unlist<-unlist(words.list)
word.vector<-words.unlist[!(words.unlist %in% stoplist[,1])] # drop the stopwords; using ! instead of -which() avoids an empty result when no stopwords match
num.chunks<-length(word.vector)/chunk.size
max<-length(word.vector)/num.chunks
x <- seq_along(word.vector)
chunks <- split(word.vector, ceiling(x/max))
#deal with small chunks at the end
if(length(chunks[[length(chunks)]]) <= 500){
chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
chunks[[length(chunks)]]<-NULL
}
chunks<-lapply(chunks, paste, collapse=" ")
chunks.df<-do.call(rbind, chunks)
chunks.df<-cbind(files[i],chunks.df)
topic.df<-rbind(topic.df, chunks.df)
}
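# A minimal sketch of the chunking idiom used inside the loop above, on a toy
# vector of ten "words" split into chunks of (roughly) three:
toy.words<-letters[1:10]
toy.chunk.size<-3
split(toy.words, ceiling(seq_along(toy.words)/toy.chunk.size)) # chunks of 3, 3, 3 and 1
# the loop above then folds any trailing chunk of 500 words or fewer into the chunk before it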
# Now skip to "Running the model" below
###############################################################################
# OPTION TWO: Requires POS tagging.
# With this corpus total tagging time = ~2.5 hours
# Here is the code I used to tag the files
# library(openNLP)
# for(i in 1:length(files)){
# doc<-xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes=TRUE)
# paras<-getNodeSet(doc, "/TEI/text/body//p")
# words<-paste(sapply(paras,xmlValue), collapse=" ")
# tagged_text <- tagPOS(words)
# write(tagged_text, paste("data/taggedCorpus/",files[i], ".txt", sep=""))
# }
inputDir<-"data/taggedCorpus"
files<-dir(path=inputDir, pattern=".*xml") # the tagged files written above end in .xml.txt, so this pattern still matches them
chunk.size<-1000 #number of words per chunk
source("code/corpusFunctions.r")
topic.df<-NULL
for(i in 1:length(files)){
tagged_text<-scan(file.path(inputDir, files[i]), what="character", sep="\n")
tagged_words <- SplitText(tagged_text)
tagged_words <- c(SelectTaggedWords(tagged_words,"/NN$")) # SELECT POS TAG TYPES TO KEEP
tagged_words_less <- RemoveTags(tagged_words)
word.vector <- RemoveNonChar(tagged_words_less)
num.chunks<-length(word.vector)/chunk.size
max<-length(word.vector)/num.chunks
x <- seq_along(word.vector)
chunks <- split(word.vector, ceiling(x/max))
#deal with small chunks at the end
if(length(chunks[[length(chunks)]]) <= 500){
chunks[[length(chunks)-1]]<-c(chunks[[length(chunks)-1]], chunks[[length(chunks)]])
chunks[[length(chunks)]]<-NULL
}
chunks<-lapply(chunks, paste, collapse=" ")
chunks.df<-do.call(rbind, chunks)
chunks.df<-cbind(files[i],chunks.df)
topic.df<-rbind(topic.df, chunks.df)
}
# Now skip to "Running the model" below
##########################################################################################
# RUNNING THE TOPIC MODEL
##########################################################################################
library(lda)
doclines<-lexicalize(topic.df[,2]) # convert the text chunks into the document/vocab format the lda package expects
set.seed(8675309) # for reproducibility
K <- 40 # number of topics
num.iterations<-250
result <- lda.collapsed.gibbs.sampler(doclines$documents, K, doclines$vocab, num.iterations, 0.1, 0.1, compute.log.likelihood=TRUE) # the two 0.1 values are the alpha and eta hyperparameters
top.words <- top.topic.words(result$topics, 25, by.score=TRUE) # 25 highest-scoring words per topic
topic.proportions <- t(result$document_sums) / colSums(result$document_sums) # each row is a chunk; its values are that chunk's topic proportions
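# Optional inspection: peek at the top words for the first few topics, and label the
# rows of topic.proportions with their source files so each chunk can be traced back to a book.
top.words[,1:3] # top 25 words for topics 1 through 3
rownames(topic.proportions)<-topic.df[,1]
round(topic.proportions[1:5, 1:5], 3) # first five chunks, first five topics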
# DONE. . . Naw, let's make some purdy clouds. . . .
##########################################################################################
# MAKE SOME WORD CLOUDS OF THE TOPIC MODEL
##########################################################################################
library(wordcloud)
result.dir<-"topicClouds/"
dir.create(result.dir, showWarnings=FALSE) # make sure the output directory exists before writing the PDFs
for(i in 1:nrow(result$topics)){
cloud.data<-sort(result$topics[i,], decreasing=T)[1:50] # the 50 most frequent words in topic i
pdf(paste(result.dir, i,".pdf", sep=""), paper="special")
print(wordcloud(names(cloud.data), freq=cloud.data, scale=c(3,.10), min.freq=1, rot.per=0, random.order=F))
dev.off()
}