DHWI: R Code Day Three

###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R. 
# Day 3
###############################################################

############################################################
# Parsing XML
############################################################
# Run this script with the working directory set to the project root (the
# directory that contains "data/"); every path below is relative to it.
# The original bare setwd() call is dropped: setwd() has no default for its
# `dir` argument, so calling it with no arguments stops the script with an
# error before anything else runs.
library(XML)
# Parse the TEI file as an internal document; getNodeSet() and xpathApply()
# run their XPath queries against this object.
doc <- xmlParse("data/XML1/melville1.xml", useInternalNodes = TRUE)
# Collect every <div1 type="chapter"> element found below the root <TEI>.
chapters <- getNodeSet(doc, "/TEI//div1[@type='chapter']")
# Print the second chapter node as a quick look at what was matched.
chapters[[2]]

# Per-chapter word-frequency tables, keyed by "<index> <chapter title>":
#   chapter.list.raws  - raw count of each word type in the chapter
#   chapter.list.freqs - the same counts as a percentage of the chapter total
chapter.list.freqs <- list()
chapter.list.raws <- list()

# seq_along() gives an empty sequence when no chapters were matched, whereas
# 1:length(chapters) would iterate over c(1, 0) and fail on chapters[[1]].
for (i in seq_along(chapters)) {
  # first get the chapter title from the head element; prefixing the loop
  # index keeps the list names distinct even if two titles are identical
  chap.title <- xmlValue(xmlElementsByTagName(chapters[[i]], "head")[[1]])
  chap.title <- paste(i, chap.title, sep = " ")
  # get only the contents of the paragraph tags
  paras <- xmlElementsByTagName(chapters[[i]], "p")
  # combine all the words from every paragraph into one string;
  # unlist(lapply()) always yields a character vector (or NULL when there
  # are no paragraphs), unlike sapply(), whose return type depends on input
  chap.words <- paste(unlist(lapply(paras, xmlValue)), collapse = " ")
  # convert to lowercase
  words.lower <- tolower(chap.words)
  # tokenize: split on every non-word character, then drop the empty
  # strings produced by runs of adjacent separators
  words.list <- strsplit(words.lower, "\\W")
  word.vector <- unlist(words.list)
  word.vector <- word.vector[word.vector != ""]
  # calculate the frequencies: raw counts first, then percentages
  chapter.freqs <- table(word.vector)
  chapter.list.raws[[chap.title]] <- chapter.freqs
  chapter.freqs.rel <- 100 * (chapter.freqs / sum(chapter.freqs))
  chapter.list.freqs[[chap.title]] <- chapter.freqs.rel
}

# Pull the relative frequency of one word out of every chapter table and
# stack the results into a one-column matrix (one row per chapter). Indexing
# a table by a name it does not contain yields NA, so chapters without the
# word appear as NA rows here.
whales <- do.call(rbind, lapply(chapter.list.freqs, "[", "whale"))
ahabs <- do.call(rbind, lapply(chapter.list.freqs, "[", "ahab"))
whales.ahabs <- cbind(whales, ahabs)
# A word absent from a chapter occurred zero times there.
whales.ahabs[is.na(whales.ahabs)] <- 0
colnames(whales.ahabs) <- c("whale", "ahab")
# Side-by-side bars per column. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, while TRUE is a reserved word.
barplot(whales.ahabs, beside = TRUE, col = "grey")


# Convert the chapter-by-word matrix to a data frame so that its columns can
# be addressed by name with `$`.
whales.ahabs.df<-as.data.frame(whales.ahabs)
# Test whether the per-chapter "whale" and "ahab" values are correlated
# (cor.test() uses Pearson's product-moment correlation by default).
cor.test(whales.ahabs.df$whale, whales.ahabs.df$ahab)

# ---- Exploring the TEI header with XPath ----

# XPath shared by all of the title look-ups below.
title.xpath <- "/TEI//fileDesc//titleStmt//title"

# Every matching <title> node, then only the first one.
xpathApply(doc, title.xpath)
xpathApply(doc, title.xpath)[[1]]

# Keep the first <title> node; show it as a node and then as plain text.
title <- xpathApply(doc, title.xpath)[[1]]
title
xmlValue(title)

# Same look-up again, assigning and displaying in one step.
title <- xpathApply(doc, title.xpath)[[1]]
title

# First <name> node found beneath an <author> element.
author <- unlist(xpathApply(doc, "/TEI//author//name"))[[1]]
author

# Attributes read from the <note> element(s) inside the TEI header.
header.note.xpath <- "/TEI//teiHeader//note"
nation <- unlist(xpathApply(doc, header.note.xpath, xmlGetAttr, "nation"))
nation
gender <- unlist(xpathApply(doc, header.note.xpath, xmlGetAttr, "gender"))
gender

# The "value" attribute of <creation>/<date> inside the TEI header.
pubdate <- unlist(
  xpathApply(doc, "/TEI//teiHeader//creation/date", xmlGetAttr, "value")
)
pubdate

# First responsibility statement inside the title statement.
respStmt <- xpathApply(doc, "/TEI//fileDesc//titleStmt//respStmt")[[1]]
respStmt

################################################################
# Macroanalysis
################################################################
# Clustering


# Build a book-by-word frequency matrix for every XML file in the corpus.
inputDir <- "data/XMLAuthorCorpus"
# dir() treats `pattern` as a regular expression. The original ".*xml"
# matched any file name that merely contains "xml"; anchoring on the
# extension restricts the listing to files that end in ".xml".
files <- dir(path = inputDir, pattern = "\\.xml$")
# Fail early with a clear message instead of an obscure parse error later.
if (length(files) == 0L) {
  stop("No .xml files found in ", inputDir, call. = FALSE)
}
library(XML)
# getWordLists() is not defined in this script; presumably it comes from
# this sourced file -- confirm against code/corpusFunctions.r.
source("code/corpusFunctions.r")

# One entry per book, named by its file name.
book.list.freqs <- list()
for (i in seq_along(files)) {
  doc <- xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes = TRUE)
  worddata <- getWordLists(doc)
  book.list.freqs[[files[i]]] <- worddata
}

# Turn each book's word data into a data frame tagged with a numeric book ID,
# then stack all books into one long data frame.
freqs.list <- mapply(
  data.frame,
  ID = seq_along(book.list.freqs),
  book.list.freqs,
  SIMPLIFY = FALSE,
  MoreArgs = list(stringsAsFactors = FALSE)
)
freqs.df <- do.call(rbind, freqs.list)
# Cross-tabulate Freq by book ID (rows) and Var1 (columns). This assumes each
# per-book data frame carries `Var1` and `Freq` columns, as produced when a
# frequency table is converted to a data frame -- confirm against
# getWordLists().
result <- xtabs(Freq ~ ID + Var1, data = freqs.df)
# Re-express every column of the cross-tabulation as plain numeric values.
final.m <- apply(result, 2, as.numeric)
# The original script repeated the freqs.list / freqs.df / result steps
# verbatim at this point; the duplicate recomputation has been removed.
#########################################################
# Did not complete clustering in this session. See day 4
#########################################################
Matthew L. Jockers

“Everything . . . in nature's vast workshop from the extinction of some remote sun to the blossoming of one of the countless flowers which beautify our public parks is subject to a law of numeration as yet unascertained.” (Joyce, Ulysses, 1922)

Leave a Reply Cancel reply