###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 1
###############################################################

# Don't forget to set your working directory. . . .

# Make moby.word.vector from Project Gutenberg Moby Dick
text <- scan("data/plaintext/melville.txt", what = "character", sep = "\n")
start <- which(text == "CHAPTER 1. Loomings.")
end <- which(
  text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville"
)
# Both marker lines must be found exactly once, otherwise the slicing below
# would fail with a cryptic message (or silently use the wrong boundary).
if (length(start) != 1L || length(end) != 1L || start >= end) {
  stop(
    "Could not locate the first-chapter and end-of-book marker lines ",
    "in data/plaintext/melville.txt",
    call. = FALSE
  )
}
novel.lines <- text[start:(end - 1)]
# Everything before the first chapter plus the Gutenberg footer.
# seq_len(start - 1) says what is meant; `1:start - 1` parses as
# `(1:start) - 1` and only works because index 0 is silently dropped.
metadata <- c(text[seq_len(start - 1)], text[end:length(text)])
novel <- paste(novel.lines, collapse = " ")
novel.lower <- tolower(novel)
# Split on every non-word character; runs of punctuation/spaces leave empty
# strings behind, which are removed right after.
moby.words <- strsplit(novel.lower, "\\W")
moby.word.vector <- unlist(moby.words)
not.blanks <- which(moby.word.vector != "")
moby.word.vector <- moby.word.vector[not.blanks]
whales <- which(moby.word.vector == "whale")

# Do some basic calculations
whale.hits <- length(whales)
total.words <- length(moby.word.vector)
100 * (whale.hits / total.words) # frequency of whale as percentage

# How many word types in Moby Dick?
length(unique(moby.word.vector))

# How many tokens of each type?
moby.freqs <- table(moby.word.vector)
# show counts for first ten types
moby.freqs[1:10]
# better when sorted
sorted.moby.freqs <- sort(moby.freqs, decreasing = TRUE)
sorted.moby.freqs[1:10]
# and I can convert them to relative frequencies by recycling:
sorted.moby.rel.freqs <- 100 * (sorted.moby.freqs / sum(sorted.moby.freqs))
# look ma, I can plot the raw frequencies too
plot(sorted.moby.freqs[1:10], type = "l")
# and I can plot the relative frequencies with some fancy pants
# Plot the bare numbers rather than the table itself: the plot method for
# tables draws and labels its own x axis, which gets in the way of
# suppressing the axis (xaxt = "n") and labelling it by hand below.
plot(
  as.numeric(sorted.moby.rel.freqs[1:10]),
  type = "b",
  xlab = "Top Ten Words In Moby Dick by Rel Freq",
  ylab = "Percentage of Full Text",
  xaxt = "n"
)
axis(1, 1:10, labels = names(sorted.moby.rel.freqs[1:10]))

# See Matt look up frequencies by word type too!
sorted.moby.freqs["he"]
sorted.moby.freqs["she"]
sorted.moby.freqs["him"]
sorted.moby.freqs["her"]
# See Matt calculate some ratios
sorted.moby.freqs["him"] / sorted.moby.freqs["her"]
sorted.moby.freqs["he"] / sorted.moby.freqs["she"]

# plot some whales
# One slot per word of the novel ("novel time"); NA everywhere except at the
# positions where the word occurs, so only those positions get a spike.
xaxis <- seq_along(moby.word.vector)
yaxis <- rep(NA, length(xaxis))
yaxis[whales] <- 1
plot(
  yaxis,
  xlab = "Novel Time", ylab = "whale",
  type = "h", ylim = c(0, 1), yaxt = "n"
)

# plot some ahabs
ahabs <- which(moby.word.vector == "ahab")
xaxis <- seq_along(moby.word.vector)
yaxis <- rep(NA, length(xaxis))
yaxis[ahabs] <- 1
plot(
  yaxis,
  xlab = "Novel Time", ylab = "ahab",
  type = "h", ylim = c(0, 1), yaxt = "n"
)

# Divide Moby Dick into chapters and generate raw count and frequency tables
# for each chapter and store the data in two new list objects, pwhew!
novel.chapter.postions <- grep("^CHAPTER \\d", novel.lines)
novel.lines[novel.chapter.postions]
# The last line of the novel closes the final chapter, so it is appended as
# one extra boundary after the chapter headings.
last.position <- length(novel.lines)
novel.chapter.postions <- c(novel.chapter.postions, last.position)

# Arrrrrrrrgh now we need to learn a forrrrrrr loop. So, . . .
justforfun <- "All R and no R&R makes Matt a dull humanist."
for (i in 1:100) {
  print(justforfun)
}

# Rolling Drum. . . the real for loop
# first instantiate two new list objects
chapter.list.freqs <- list()
chapter.list.raws <- list()
# The final entry of novel.chapter.postions is the end-of-novel boundary, not
# a chapter heading, so there is one chapter fewer than there are positions.
chapter.count <- length(novel.chapter.postions) - 1
for (i in seq_len(chapter.count)) {
  chapter.title <- novel.lines[novel.chapter.postions[i]]
  # NOTE: start/end are reused here; after the loop they no longer hold the
  # novel boundaries computed at the top of the script.
  start <- novel.chapter.postions[i] + 1
  if (i == chapter.count) {
    # The closing boundary is the novel's own last line, which belongs to the
    # final chapter; subtracting 1 here would silently drop that line.
    end <- last.position
  } else {
    # Stop on the line just before the next chapter heading.
    end <- novel.chapter.postions[i + 1] - 1
  }
  # A heading with no body (start > end) must yield no lines; start:end would
  # otherwise count downwards and pick up the wrong lines.
  chapter.lines <- if (start <= end) novel.lines[start:end] else character(0)
  chapter.words <- tolower(paste(chapter.lines, collapse = " "))
  chapter.words.list <- strsplit(chapter.words, "\\W")
  chapter.words.vector <- unlist(chapter.words.list)
  chapter.words.vector <- chapter.words.vector[chapter.words.vector != ""]
  # Raw counts per word type, then the same counts as a percentage of the
  # chapter's total tokens; both are stored under the chapter heading.
  chapter.freqs <- table(chapter.words.vector)
  chapter.list.raws[[chapter.title]] <- chapter.freqs
  chapter.freqs.rel <- 100 * (chapter.freqs / sum(chapter.freqs))
  chapter.list.freqs[[chapter.title]] <- chapter.freqs.rel
}

##########################################
# End of Day 1
# Happy Day two!
##########################################
# Let's see what this baby we made yesterday is made of. . .
str(chapter.list.raws)