Year Two Instructions

Last year we got our hands dirty working through all of the chapters in Text Analysis with R for Students of Literature. This year, we’ll do a “deep(ish) dive” into dplyr and xml2, two awesome R packages developed by R star Hadley Wickham.

In preparation for the year two R sessions, you should:

  1. Make sure that you have everything in order (see instructions from last year)
  2. Carefully review chapter 7 of *Text Analysis with R for Students of Literature* and be sure you understand all the practice exercises on page 72 as well as the coded answers to those questions, which are found on pages 182–183.
  3. Skim chapter 10, especially sections 10.5 and 10.6.
  4. Copy a specially marked-up version of Hamlet from my website (or you can download, unzip, and save a version here).
  5. Copy (or unzip) and save the xml file as “hamlet.xml” and place it inside the data folder of your main TextAnalysisWithR folder.

Start Up Code:

# Read the plain-text novel into a character vector, splitting on newlines.
# Replace "/your/path/here" with the location of your own TextAnalysisWithR
# folder before running.
text.v <- scan(
  "/your/path/here/TextAnalysisWithR/data/plainText/melville.txt",
  what = "character", sep = "\n"
)

# Find the lines that mark the first chapter heading and the final line of
# the novel; everything outside that range is discarded below.
start.v <- which(text.v == "CHAPTER 1. Loomings.")
end.v <- which(text.v == "orphan.")

# Without this check a missing marker surfaces as a cryptic
# "argument of length 0" error from `start.v:end.v`.
if (length(start.v) == 0 || length(end.v) == 0) {
  stop(
    "Could not find the start and/or end marker lines in the text file.",
    call. = FALSE
  )
}

# Keep only the lines of the novel itself.
novel.lines.v <- text.v[start.v:end.v]

# Positions of every chapter heading within the novel, followed by the
# position of the last line, which serves as the end boundary for the
# final chapter.
chap.positions.v <- grep("^CHAPTER \\d", novel.lines.v)
last.position.v <- length(novel.lines.v)
chap.positions.v <- c(chap.positions.v, last.position.v)
# Build one word-frequency table per chapter, keyed by the chapter heading:
#   chapter.raws.l  - raw count of each word in the chapter
#   chapter.freqs.l - the same counts as a percentage of the chapter's words
chapter.freqs.l <- list()
chapter.raws.l <- list()

# Each chapter runs from one position in chap.positions.v to the next, so
# the final position only closes the last chapter and is never visited as a
# chapter start.
for (i in seq_len(max(length(chap.positions.v) - 1, 0))) {
  chapter.title <- novel.lines.v[chap.positions.v[i]]

  # The chapter body starts on the line after its heading and stops on the
  # line before the next boundary.
  # NOTE(review): if the last element of chap.positions.v is the index of
  # the final line of novel.lines.v, the `- 1` also drops that final line
  # from the last chapter. Left as-is to keep results unchanged; confirm
  # whether this is intended.
  start <- chap.positions.v[i] + 1
  end <- chap.positions.v[i + 1] - 1
  chapter.lines.v <- novel.lines.v[start:end]

  # Collapse the chapter to one lowercase string, split on non-word
  # characters, and discard the empty strings the split leaves behind.
  chapter.words.v <- tolower(paste(chapter.lines.v, collapse = " "))
  chapter.words.l <- strsplit(chapter.words.v, "\\W")
  chapter.word.v <- unlist(chapter.words.l)
  chapter.word.v <- chapter.word.v[chapter.word.v != ""]

  # Raw counts, then relative frequencies per 100 words.
  chapter.freqs.t <- table(chapter.word.v)
  chapter.raws.l[[chapter.title]] <- chapter.freqs.t
  chapter.freqs.t.rel <- 100 * (chapter.freqs.t / sum(chapter.freqs.t))
  chapter.freqs.l[[chapter.title]] <- chapter.freqs.t.rel
}

Additional Code

# Chapter lengths: summing a chapter's raw frequency table gives its total
# word count.
sum_list <- lapply(chapter.raws.l, sum)

# The same totals in two shapes: a one-column matrix (one row per chapter)
# and a two-column data frame (`values` = length, `ind` = chapter heading).
chapter.lengths.m <- do.call(rbind, sum_list)
chap_lens <- stack(sum_list)

head(chap_lens)

# The `values` column holds the chapter lengths.
chap_lens[["values"]]

# Counts the entries of a frequency table that equal 1, i.e. the words that
# occur exactly once (hapax legomena).
count_hapax <- function(freqs) {
  sum(freqs == 1)
}

# Two routes to the per-chapter hapax counts: sapply() simplifies straight
# to a named vector, while lapply() + stack() yields a data frame.
chapter_hapax.v <- sapply(chapter.raws.l, count_hapax)

chap_haps_l <- lapply(chapter.raws.l, count_hapax)
chap_haps <- stack(chap_haps_l)

# Compare the classes and the first few entries of the two results.
class(chapter_hapax.v)

class(chap_haps)

head(chapter_hapax.v)

head(chap_haps)

# Naively binding the two stacked data frames side by side keeps all four
# columns (both `values` and both `ind` columns).
a_data_frame <- data.frame(chap_lens, chap_haps)

dim(a_data_frame)

# A tidier version: one row per chapter with its heading, its length in
# words, and its hapax count. The column is spelled out as `values` rather
# than relying on `$` partial matching of `value`.
hap_lens_df <- data.frame(
  chap_names = chap_lens$ind,
  chap_lens = chap_lens$values,
  num_hapax = chap_haps$values
)

head(hap_lens_df)

library(dplyr)

# Add each chapter's hapax count as a share of its length. Despite the
# column name, `hap_percent` is a proportion between 0 and 1, not a
# percentage.
new_df <- mutate(hap_lens_df, hap_percent = num_hapax / chap_lens)

# Plot the proportion per chapter, labelling the bars 1, 2, 3, ... in
# chapter order.
barplot(new_df$hap_percent, names.arg = seq_along(chapter.raws.l))

# Derive a short chapter label by deleting everything from the first period
# to the end of the heading.
nice_df <- new_df %>%
  mutate(short_title = gsub("\\..*$", "", chap_names))
head(nice_df)

# Inspect the column names and the first rows once more.
colnames(nice_df)

head(nice_df)

# Rows (chapters) whose hapax proportion is above one half.
nice_df %>%
  filter(hap_percent > 0.5)

# A single column, for every chapter.
nice_df %>%
  select(hap_percent)

# Combine the two verbs: filter the rows, then keep only the columns of
# interest.
nice_df %>%
  filter(hap_percent > 0.5) %>%
  select(short_title, hap_percent)

# Summary statistics of the hapax proportion for those chapters.
nice_df %>%
  filter(hap_percent > 0.5) %>%
  select(hap_percent) %>%
  summary()

# The same summary, with chapter length alongside.
nice_df %>%
  filter(hap_percent > 0.5) %>%
  select(hap_percent, chap_lens) %>%
  summary()

# Chapters whose hapax proportion is below 0.2, sorted from lowest to
# highest.
nice_df %>%
  filter(hap_percent < 0.2) %>%
  select(short_title, hap_percent) %>%
  arrange(hap_percent)

# The same chapters, sorted from highest to lowest.
nice_df %>%
  filter(hap_percent < 0.2) %>%
  select(short_title, hap_percent) %>%
  arrange(desc(hap_percent))