Last year we got our hands dirty working through all of the chapters in *Text Analysis with R for Students of Literature*. This year, we’ll do a “deep(ish) dive” into dplyr and xml2, two awesome R packages developed by R star Hadley Wickham.
In preparation for the year two R sessions, you should:
- Make sure that you have everything in order (see instructions from last year)
- Carefully review chapter 7 of *Text Analysis with R for Students of Literature* and be sure you understand all the practice exercises on page 72 as well as the coded answers to those questions that are found on pages 182 – 183.
- Skim chapter 10, especially sections 10.5 and 10.6.
- Copy a specially marked-up version of Hamlet from my website (or you can download, unzip, and save a version here).
- Copy (or unzip) and save the xml file as “hamlet.xml” and place it inside the data folder of your main TextAnalysisWithR folder.
Start Up Code:
# Start-up code: load the novel, split it into chapters, and build one
# word-frequency table per chapter.
#
# Results used by the later code:
#   chapter.raws.l  - list keyed by chapter heading; raw word counts (table)
#   chapter.freqs.l - list keyed by chapter heading; counts as a percentage
#                     of all words in that chapter

# Read the text line by line; edit the path to match your machine.
text.v <- scan(
  "/your/path/here/TextAnalysisWithR/data/plainText/melville.txt",
  what = "character",
  sep = "\n"
)

# Locate the first and last lines of the novel proper.
start.v <- which(text.v == "CHAPTER 1. Loomings.")
end.v <- which(text.v == "orphan.")
if (length(start.v) == 0 || length(end.v) == 0) {
  stop(
    "Could not find the start and/or end marker line in text.v; ",
    "check the file path and the contents of melville.txt.",
    call. = FALSE
  )
}
novel.lines.v <- text.v[start.v:end.v]

# Positions of the chapter headings, followed by the position of the last
# line so that the final chapter has an end boundary too.
chap.positions.v <- grep("^CHAPTER \\d", novel.lines.v)
last.position.v <- length(novel.lines.v)
chap.positions.v <- c(chap.positions.v, last.position.v)

chapter.freqs.l <- list()
chapter.raws.l <- list()

# The last element of chap.positions.v is the end boundary, not a heading,
# so there is one fewer chapter than there are positions.
n.chapters <- length(chap.positions.v) - 1
for (i in seq_len(n.chapters)) {
  chapter.title <- novel.lines.v[chap.positions.v[i]]
  start <- chap.positions.v[i] + 1
  if (i < n.chapters) {
    # Stop on the line just before the next chapter heading.
    end <- chap.positions.v[i + 1] - 1
  } else {
    # The final boundary is the last line of the novel itself (real text,
    # not a heading), so it belongs to the final chapter; subtracting 1
    # here would silently drop the closing line.
    end <- last.position.v
  }

  # A heading with no body would make start:end count backwards.
  if (start <= end) {
    chapter.lines.v <- novel.lines.v[start:end]
  } else {
    chapter.lines.v <- character(0)
  }

  # Lowercase, split on non-word characters, and drop the empty strings
  # that the split leaves behind.
  chapter.words.v <- tolower(paste(chapter.lines.v, collapse = " "))
  chapter.words.l <- strsplit(chapter.words.v, "\\W")
  chapter.word.v <- unlist(chapter.words.l)
  chapter.word.v <- chapter.word.v[chapter.word.v != ""]

  chapter.freqs.t <- table(chapter.word.v)
  chapter.raws.l[[chapter.title]] <- chapter.freqs.t
  chapter.freqs.t.rel <- 100 * (chapter.freqs.t / sum(chapter.freqs.t))
  chapter.freqs.l[[chapter.title]] <- chapter.freqs.t.rel
}
Additional Code:
# Chapter lengths and hapax counts, first in base R.
# Expects chapter.raws.l: a named list of per-chapter word-count tables.

# Total number of word tokens per chapter (sum of the raw counts).
sum_list <- lapply(chapter.raws.l, sum)

# The same lengths as a one-column matrix with one row per chapter ...
chapter.lengths.m <- do.call(rbind, sum_list)

# ... and as a two-column data frame: `values` (length) and `ind` (chapter).
chap_lens <- stack(sum_list)
head(chap_lens)
chap_lens$values # tada, the lengths

# Hapax legomena: word types that occur exactly once in a chapter.
# vapply() guarantees an integer vector even if the list is empty.
chapter_hapax.v <- vapply(
  chapter.raws.l,
  function(x) sum(x == 1),
  integer(1)
)
chap_haps_l <- lapply(chapter.raws.l, function(x) sum(x == 1))
chap_haps <- stack(chap_haps_l)

# Compare the named-vector result with the stacked data-frame result.
class(chapter_hapax.v)
class(chap_haps)
head(chapter_hapax.v)
head(chap_haps)

# Naive column-bind of the two stacked frames (column names get deduplicated).
a_data_frame <- data.frame(chap_lens, chap_haps)
dim(a_data_frame)

# A tidier frame with explicit column names. The column is spelled out in
# full as `values`; `$value` would only work through partial matching.
hap_lens_df <- data.frame(
  chap_names = chap_lens$ind,
  chap_lens = chap_lens$values,
  num_hapax = chap_haps$values
)
head(hap_lens_df)
# Explore hapax richness per chapter with dplyr.
# Expects hap_lens_df (columns chap_names, chap_lens, num_hapax) and
# chapter.raws.l from the code above.
library(dplyr)

# hap_percent is num_hapax / chap_lens, i.e. a proportion between 0 and 1
# (it is not multiplied by 100 despite the name).
new_df <- mutate(hap_lens_df, hap_percent = num_hapax / chap_lens)

# One bar per chapter, labelled with the chapter's index.
barplot(new_df$hap_percent, names.arg = seq_along(chapter.raws.l))

# Keep only the text before the first period, e.g. "CHAPTER 1".
nice_df <- mutate(new_df, short_title = gsub("\\..*$", "", chap_names))
head(nice_df)
colnames(nice_df)
head(nice_df)

# Single verbs: rows above a threshold, then one column.
filter(nice_df, hap_percent > .5)
select(nice_df, hap_percent)

# Chaining verbs with the pipe: chapters where more than half of the
# tokens are hapax legomena.
nice_df %>%
  filter(hap_percent > .5) %>%
  select(short_title, hap_percent)

# Summary statistics for those chapters.
nice_df %>%
  filter(hap_percent > .5) %>%
  select(hap_percent) %>%
  summary()

nice_df %>%
  filter(hap_percent > .5) %>%
  select(hap_percent, chap_lens) %>%
  summary()

# Chapters with a low hapax proportion, sorted ascending ...
nice_df %>%
  filter(hap_percent < .2) %>%
  select(short_title, hap_percent) %>%
  arrange(hap_percent)

# ... and descending.
nice_df %>%
  filter(hap_percent < .2) %>%
  select(short_title, hap_percent) %>%
  arrange(desc(hap_percent))