###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 3
###############################################################

############################################################
# Parsing XML
############################################################

# All paths below ("data/...", "code/...") are relative, so the working
# directory must be the project root before this script is run.
# The original bare `setwd()` call was removed: `setwd()` has no default
# for its `dir` argument, so it always raised an error. Set the directory
# from your IDE, or uncomment and edit the line below.
# setwd("path/to/project")

library(XML)

# Parse the TEI file into an internal (C-level) document so that XPath
# queries can be run against it.
doc <- xmlParse("data/XML1/melville1.xml", useInternalNodes = TRUE)

# One node per chapter-level <div1>.
chapters <- getNodeSet(doc, "/TEI//div1[@type='chapter']")
chapters[[2]]

# Per-chapter word tables, keyed by "<index> <chapter title>":
#   chapter.list.raws  - raw counts
#   chapter.list.freqs - relative frequencies, in percent
chapter.list.freqs <- list()
chapter.list.raws <- list()

# seq_along() (rather than 1:length()) skips the loop entirely when no
# chapters were found instead of iterating over c(1, 0).
for (i in seq_along(chapters)) {
  # first get the chapter title from the head element
  chap.title <- xmlValue(xmlElementsByTagName(chapters[[i]], "head")[[1]])
  chap.title <- paste(i, chap.title, sep = " ")

  # get only the contents of the paragraph tags
  paras <- xmlElementsByTagName(chapters[[i]], "p")

  # combine all the words from every paragraph
  chap.words <- paste(unlist(lapply(paras, xmlValue)), collapse = " ")

  # convert to lowercase
  words.lower <- tolower(chap.words)

  # tokenize on non-word characters and drop the empty strings that
  # adjacent delimiters leave behind
  words.list <- strsplit(words.lower, "\\W")
  word.vector <- unlist(words.list)
  word.vector <- word.vector[word.vector != ""]

  # calculate the frequencies
  chapter.freqs <- table(word.vector)
  chapter.list.raws[[chap.title]] <- chapter.freqs
  chapter.freqs.rel <- 100 * (chapter.freqs / sum(chapter.freqs))
  chapter.list.freqs[[chap.title]] <- chapter.freqs.rel
}

# Pull the relative frequency of one word out of every chapter table;
# chapters in which the word never occurs yield NA.
whales <- do.call(rbind, lapply(chapter.list.freqs, "[", "whale"))
ahabs <- do.call(rbind, lapply(chapter.list.freqs, "[", "ahab"))
whales.ahabs <- cbind(whales, ahabs)

# A word that is absent from a chapter has frequency 0, not NA.
whales.ahabs[is.na(whales.ahabs)] <- 0
colnames(whales.ahabs) <- c("whale", "ahab")
barplot(whales.ahabs, beside = TRUE, col = "grey")

# Correlation between the per-chapter frequencies of the two words.
whales.ahabs.df <- as.data.frame(whales.ahabs)
cor.test(whales.ahabs.df$whale, whales.ahabs.df$ahab)

# Exploring the TEI header: run the title query once, look at the whole
# result, then at the first matching node and its text content.
title.xpath <- "/TEI//fileDesc//titleStmt//title"
title.nodes <- xpathApply(doc, title.xpath)
title.nodes
(title <- title.nodes[[1]])
xmlValue(title)

# Other header metadata. Wrapping an assignment in parentheses prints the
# assigned value.
(author <- unlist(xpathApply(doc, "/TEI//author//name"))[[1]])
(nation <- unlist(xpathApply(doc, "/TEI//teiHeader//note",
                             xmlGetAttr, "nation")))
(gender <- unlist(xpathApply(doc, "/TEI//teiHeader//note",
                             xmlGetAttr, "gender")))
(pubdate <- unlist(xpathApply(doc, "/TEI//teiHeader//creation/date",
                              xmlGetAttr, "value")))
(respStmt <- xpathApply(doc, "/TEI//fileDesc//titleStmt//respStmt")[[1]])

################################################################
# Macroanalysis
################################################################

# Clustering
inputDir <- "data/XMLAuthorCorpus"

# Anchor the pattern to the file extension; the original ".*xml" matched
# any file name that merely contained "xml".
files <- dir(path = inputDir, pattern = "\\.xml$")

library(XML)
# Provides getWordLists(), which is defined outside this file.
source("code/corpusFunctions.r")

# One entry per corpus file, keyed by file name, holding whatever
# getWordLists() returns for that document.
book.list.freqs <- list()
for (i in seq_along(files)) {
  doc <- xmlTreeParse(file.path(inputDir, files[i]), useInternalNodes = TRUE)
  worddata <- getWordLists(doc)
  book.list.freqs[[files[i]]] <- worddata
}

# Turn each book's word data into a data frame tagged with a numeric book
# ID, then stack them into one long data frame.
# NOTE(review): the xtabs() formula below relies on columns named Var1 and
# Freq, which is what data.frame() produces from a table object --
# presumably getWordLists() returns a table; confirm in corpusFunctions.r.
freqs.list <- mapply(data.frame,
                     ID = seq_along(book.list.freqs),
                     book.list.freqs,
                     SIMPLIFY = FALSE,
                     MoreArgs = list(stringsAsFactors = FALSE))
freqs.df <- do.call(rbind, freqs.list)

# Cross-tabulate into a books-by-words table and convert it to a plain
# numeric matrix.
result <- xtabs(Freq ~ ID + Var1, data = freqs.df)
final.m <- apply(result, 2, as.numeric)

# (The session log repeated the freqs.list / freqs.df / result computation
# here verbatim; the duplicate was dropped because it recomputed identical
# values.)

#########################################################
# Did not complete clustering in this session. See day 4
#########################################################