###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Day 2
###############################################################
# Don't forget to set your working directory. . . .

# Load the Moby Dick file
text<-scan("data/plaintext/melville.txt", what="character", sep="\n")
start<-which(text == "CHAPTER 1. Loomings.")
end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville")
novel.lines<- text[start:(end-1)]
novel.chapter.positions<-grep("^CHAPTER \\d", novel.lines)
last.position<-length(novel.lines)
novel.chapter.positions<-c(novel.chapter.positions, last.position)

# Create two list objects in which to store type/token tables for each chapter of the novel
chapter.list.freqs<-list()
chapter.list.raws<-list()

# Run the loop to fill the two list objects
for(i in 1:length(novel.chapter.positions)){
  if(i != length(novel.chapter.positions)){
    chapter.title<-novel.lines[novel.chapter.positions[i]]
    start<-novel.chapter.positions[i]+1
    end<-novel.chapter.positions[i+1]-1
    chapter.lines<-novel.lines[start:end]
    chapter.words<-tolower(paste(chapter.lines, collapse=" "))
    chapter.words.list<-strsplit(chapter.words, "\\W")
    chapter.words.vector<-unlist(chapter.words.list)
    chapter.words.vector<-chapter.words.vector[which(chapter.words.vector!="")]
    chapter.freqs<-table(chapter.words.vector)
    chapter.list.raws[[chapter.title]]<-chapter.freqs
    chapter.freqs.rel<-100*(chapter.freqs/sum(chapter.freqs))
    chapter.list.freqs[[chapter.title]]<-chapter.freqs.rel
  }
}

# Access the relative frequency (a percentage, not a raw count) of "whale" in chapter 1
chapter.list.freqs[[1]]["whale"]

# Use lapply with bracket to get whale and ahab frequencies for each chapter
whale.list<-lapply(chapter.list.freqs, "[", "whale")
ahab.list<-lapply(chapter.list.freqs, "[", "ahab")

# Convert the list data into matrix data with do.call
whales<-do.call(rbind, whale.list)
ahabs<-do.call(rbind, ahab.list)

# Recast as vectors
v.whales<-as.vector(whales)
v.ahabs<-as.vector(ahabs)

# Bind the vectors into a new matrix
whales.ahabs<-cbind(v.whales, v.ahabs)

# Or an alternate method that creates better column names
whales.ahabs<-cbind(whale=v.whales, ahab=v.ahabs)

# Get rid of those pesky NAs (chapters in which a word never occurs)
whales.ahabs[which(is.na(whales.ahabs))]<-0

# Use colnames to make the names even better
colnames(whales.ahabs)<-c("WHALE", "AHAB")

# Plot the values in each column side by side
barplot(whales.ahabs, beside=T, col="black")

# Look at the correlation using cor()
cor(whales.ahabs[,1], whales.ahabs[,2])

# Recast the data as a data frame
cordata<-as.data.frame(whales.ahabs)

# See how we can use the $ to access a column by name
cor(cordata$WHALE, cordata$AHAB)

# Use cor with sample to randomize the order of values in one column
# and see what correlation looks like by chance
cor(sample(cordata$WHALE), cordata$AHAB)

# GUI VERSION: print 100 randomized correlations to the console
for(i in 1:100){
  print(cor(sample(cordata$WHALE), cordata$AHAB))
}

# Better way: collect the randomized correlations in a variable
myvar<-NULL
for(i in 1:1000){
  myvar<-rbind(myvar, cor(sample(cordata$WHALE), cordata$AHAB))
}
mean(myvar)
max(myvar)
min(myvar)
max(abs(myvar))

# Or just use cor.test to see the p-value, etc. . .
cor.test(cordata$WHALE, cordata$AHAB)
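# A minimal editorial sketch (not part of the original session): the
# "better way" loop above grows myvar with rbind on every pass, which
# gets slow for large runs. Base R's replicate() evaluates the same
# expression a fixed number of times and returns the results as a vector:
random.cors<-replicate(1000, cor(sample(cordata$WHALE), cordata$AHAB))
mean(random.cors)
max(abs(random.cors)) # compare against the observed correlation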
Loomings.") end<-which(text == "End of Project Gutenberg's Moby Dick; or The Whale, by Herman Melville") novel.lines<- text[start:(end-1)] novel.chapter.postions<-grep("^CHAPTER \\d", novel.lines) last.position<-length(novel.lines) novel.chapter.postions<-c(novel.chapter.postions, last.position) chapter.list.freqs<-list() chapter.list.raws<-list() for(i in 1:length(novel.chapter.postions)){ if(i != length(novel.chapter.postions)){ chapter.title<-novel.lines[novel.chapter.postions[i]] start<-novel.chapter.postions[i]+1 end<-novel.chapter.postions[i+1]-1 chapter.lines<-novel.lines[start:end] chapter.words<-tolower(paste(chapter.lines, collapse=" ")) chapter.words.list<-strsplit(chapter.words, "\\W") chapter.words.vector<-unlist(chapter.words.list) chapter.words.vector<-chapter.words.vector[which(chapter.words.vector!="")] chapter.freqs<-table(chapter.words.vector) chapter.list.raws[[chapter.title]]<-chapter.freqs chapter.freqs.rel<-100*(chapter.freqs/sum(chapter.freqs)) chapter.list.freqs[[chapter.title]]<-chapter.freqs.rel } } sum(chapter.list.raws[[1]]) # Total Tokens in Chapter 1 length(chapter.list.raws[[1]]) #Total Types in Chapter 1 sum(chapter.list.raws[[1]])/length(chapter.list.raws[[1]]) #TTR mean(chapter.list.raws[[1]]) # mean of chapter one lapply(chapter.list.raws, mean) #mean of all chapters #now saving those means into new var mean.word.use<-do.call(rbind, lapply(chapter.list.raws, mean)) #Look ma, row names are in them thar vectors rownames(mean.word.use) #Look at me! plot(mean.word.use, type="h") # Subtract the overall mean to see divergence from "expected." scale(mean.word.use) plot(scale(mean.word.use), type="h") #That pesky order function order(mean.word.use, decreasing=T) # order the means mean.word.use[order(mean.word.use, decreasing=T),] #Grab all the type counts chapter.lengths<-do.call(rbind, lapply(chapter.list.raws, sum)) # examine correlation between mean word use and type count # the long way. . . my.m<-cbind(chapter.lengths, mean.word.use) cor(my.m) #short way cor(cbind(chapter.lengths, mean.word.use)) #or using cor.test cor.test(chapter.lengths, mean.word.use) #let's get those pesky singletons. . . chapter.hapax<-sapply(chapter.list.raws, function(x) sum(x==1)) # and now as a percentage hapax.percentage<-chapter.hapax/chapter.lengths # look ma, I can plot them too. . . barplot(hapax.percentage, beside=T, col="grey", names.arg=seq(1:length(chapter.list.raws))) #Does the number of hapax correlate to chapter length? cor.test(chapter.lengths, chapter.hapax) ################################################ # Afternoon Session 1, Day 2 # Clear your Workspace # Clear your Console # Clear your Mind # Begin. . . ################################################ # FUNCTIONs and Do It KWIC inputDir<-"data/plainText" files<-dir(inputDir, ".*txt") #dir function ################################################ # show.files Function # A user-defined (e.g. 
"us," you and me) # function to print a vector of files names # in user-friendly format ################################################ show.files<-function(file.name.vector){ for(i in 1:length(file.name.vector)){ cat(i, file.name.vector[i], "\n", sep=" ") } } # now call the function with the files argument from above show.files(files) ################################################ # make.file.word.list Function # takes a vector of file names and a dir path # and return a list in which each item in the list is # an ordered vector of words from one of the files in the # files vector ################################################ make.file.word.list<-function(file.vector, inputDir){ text.word.vector.list<-list() for(i in 1:length(file.vector)){ # read in the file from dir text.lines<-scan(paste(inputDir, file.vector[i], sep="/"), what="character", sep="\n") # convert to single string text<-paste(text.lines, collapse=" ") text.lower<-tolower(text) text.words.list<-strsplit(text.lower, "\\W") text.word.vector<-unlist(text.words.list) # Remove the blanks text.word.vector<-text.word.vector[which(text.word.vector!="")] # use the index id from the file.vector vector as the "name" in the list text.word.vector.list[[file.vector[i]]]<-text.word.vector } return(text.word.vector.list) } #build a list object using the function my.corpus<-make.file.word.list(files, inputDir) my.corpus[[1]][1:100] # look at this positions<-which(my.corpus[[1]][]=="gutenberg") #find gutenberg first.instance<-positions[1] cat(my.corpus[[1]][(first.instance-1):(first.instance+1)]) a.dogs<-which(my.corpus[[1]][]=="dog") # 1 for austen m.dogs<-which(my.corpus[[2]][]=="dog") # 2 for melville # a hard coded KWIC list context<-2 for(i in 1:length(m.dogs)){ cat(my.corpus[[2]][(m.dogs[i]-context):(m.dogs[i]+context)], "\n") } ############################################################### # Do It KWIC--BETTER # Clear your workspace # we are now putting our functions in another # file called "corpusFunctions.r" ############################################################### source("code/corpusFunctions.r") # reference a supporting file inputDir<-"data/plainText" files<-dir(inputDir, ".*txt") outputDir<-"results/" my.corpus<-make.file.word.list(files, inputDir) # this function is in another file ############################################################### # For the rest of Day 2, look at </code> <a href="http://www.matthewjockers.net/materials/dhwi-r-code-functions-file/">corpusFunctions.r</a><code> ###############################################################