###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Functions
###############################################################

#######################################################################
# A Function to print a vector of file names in user friendly format
#
# file.name.vector: character vector of file names.
# Prints one line per file: its index, the name, and a newline,
# separated by single spaces. Prints nothing for an empty vector.
#######################################################################
show.files <- function(file.name.vector) {
  # seq_along() (not 1:length()) so an empty vector prints nothing.
  for (i in seq_along(file.name.vector)) {
    cat(i, file.name.vector[i], "\n", sep = " ")
  }
}

################################################################
# A Function takes a vector of file names and a directory path and
# returns a list in which each item in the list is an ordered
# vector of words from one of the files in the files vector
#
# file.vector: character vector of file names inside inputDir.
# inputDir:    path of the directory that holds the files.
# Returns a list named by file name; each element is the lower-cased
# words of that file, in order, with empty tokens removed.
################################################################
make.file.word.list <- function(file.vector, inputDir) {
  text.word.vector.list <- list()
  for (i in seq_along(file.vector)) {
    # read in the file from dir, one element per (non-empty) line
    text.lines <- scan(file.path(inputDir, file.vector[i]),
                       what = "character", sep = "\n")
    # convert to a single lower-cased string and split on non-word chars
    text <- paste(text.lines, collapse = " ")
    text.word.vector <- unlist(strsplit(tolower(text), "\\W"))
    # remove the blanks left behind by adjacent separators
    text.word.vector <- text.word.vector[which(text.word.vector != "")]
    # use the file name as the "name" of the list item
    text.word.vector.list[[file.vector[i]]] <- text.word.vector
  }
  return(text.word.vector.list)
}

################################################################
# Private helper: compute the context window around one keyword hit.
#
# words:   character vector of words.
# hit:     index of the keyword in words.
# context: number of words to show on each side.
# Returns a list with the clamped window bounds (start, end), the
# words to the left of the hit, the keyword itself, and the words
# to the right. left/right are empty vectors when the hit sits on
# the corresponding edge of the text (or when context is 0).
################################################################
kwic.window <- function(words, hit, context) {
  first <- max(1, hit - context)
  last <- min(length(words), hit + context)
  # Guard the ranges explicitly: in R, a:b counts DOWN when a > b.
  left <- if (hit > first) words[first:(hit - 1)] else character(0)
  right <- if (hit < last) words[(hit + 1):last] else character(0)
  list(start = first, end = last,
       left = left, keyword = words[hit], right = right)
}

################################################################
# Private helper: validate the numbers typed in by the user.
# Stops with a readable message when the file number does not
# identify one of the n.files files or the context is not a
# non-negative number.
################################################################
kwic.check.input <- function(fileid, context, n.files) {
  if (is.na(fileid) || fileid < 1 || fileid > n.files ||
      fileid != floor(fileid)) {
    stop("File number must be a whole number between 1 and ", n.files,
         call. = FALSE)
  }
  if (is.na(context) || context < 0) {
    stop("Context must be a non-negative number", call. = FALSE)
  }
  invisible(TRUE)
}

################################################################
# A Simple Function for creating a KWIC list
#
# named.text.word.vector.list: named list of word vectors, as
#   returned by make.file.word.list().
# Interactively asks for a file number, a context size and a
# keyword, then prints each occurrence of the keyword with its
# surrounding words, one occurrence per line. Prints nothing when
# the keyword does not occur.
################################################################
doitKwic <- function(named.text.word.vector.list) {
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid <- as.numeric(readline("Which file would you like to examine? \nEnter a file number: \n"))
  context <- as.numeric(readline("How much context do you want to see, Enter a number: \n"))
  keyword <- tolower((readline("Enter a keyword: \n")))
  kwic.check.input(fileid, context, length(named.text.word.vector.list))

  words <- named.text.word.vector.list[[fileid]]
  hits <- which(words == keyword)
  for (hit in hits) {
    # window is clamped to the text, so no NA padding near the edges
    win <- kwic.window(words, hit, context)
    cat(words[win$start:win$end], "\n")
  }
  invisible(NULL)
}

################################################################
# A Nicer Function for creating a KWIC list
#
# named.text.word.vector.list: named list of word vectors, as
#   returned by make.file.word.list().
# Works like doitKwic() but prints a numbered header per hit and
# brackets the keyword. Afterwards offers to save the hits as a
# CSV file (columns: position, left, keyword, right) in the
# working directory. Prints a notice when the keyword is absent.
################################################################
doitKwicBetter <- function(named.text.word.vector.list) {
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid <- as.numeric(readline("Which file would you like to examine? Enter a file number: \n"))
  context <- as.numeric(readline("How much context do you want to see, Enter a number: \n"))
  keyword <- tolower((readline("Enter a keyword: \n")))
  kwic.check.input(fileid, context, length(named.text.word.vector.list))

  words <- named.text.word.vector.list[[fileid]]
  hits <- which(words == keyword)
  if (length(hits) > 0) {
    # collect the rows in a preallocated list and bind them once
    rows <- vector("list", length(hits))
    for (h in seq_along(hits)) {
      win <- kwic.window(words, hits[h], context)
      cat("\n-----------------------", h, "-------------------------\n")
      cat(win$left, sep = " ")
      cat(" [", win$keyword, "] ", sep = "")
      cat(win$right, sep = " ")
      rows[[h]] <- cbind(hits[h],
                         paste(win$left, collapse = " "),
                         paste(win$keyword, collapse = " "),
                         paste(win$right, collapse = " "))
    }
    result <- do.call(rbind, rows)
    colnames(result) <- c("position", "left", "keyword", "right")
    toprint <- as.numeric((readline("Would you like to save this result to a file: enter 1=yes or 0=no \n")))
    # isTRUE() so that a non-numeric answer (NA) simply means "no"
    if (isTRUE(toprint == 1)) {
      # paste0() so the file name contains no stray spaces
      write.csv(result,
                paste0(keyword, "_In_", context,
                       names(named.text.word.vector.list)[fileid], ".csv"))
    }
  } else {
    cat("YOUR KEYWORD WAS NOT FOUND\n")
  }
  invisible(NULL)
}

############################################################
# A Function to extract a table of relative frequencies
#
# doc.object: a parsed XML document that is queried with
#   getNodeSet()/xmlValue() (presumably from the XML package,
#   which must already be attached -- confirm against callers).
# Returns a table giving, for every word found in the <p> nodes
# under /TEI/text/body, its frequency as a percentage of all words.
############################################################
getWordLists <- function(doc.object) {
  paras <- getNodeSet(doc.object, "/TEI/text/body//p")
  words <- paste(sapply(paras, xmlValue), collapse = " ")
  words.lower <- tolower(words)
  # split on non-word characters and on underscores
  words.list <- strsplit(words.lower, "\\W|_")
  word.vector <- unlist(words.list)
  book.freqs <- table(word.vector[which(word.vector != "")])
  book.freqs.rel <- 100 * (book.freqs / sum(book.freqs))
  return(book.freqs.rel)
}

############################################################
# Like getWordLists() but works on segments of the text.
#
# doc.object: parsed XML document (see getWordLists()).
# chunk.size: the token vector is divided into groups of
#   length(word.vector) / chunk.size tokens each.
# Returns a list with one table of relative word frequencies
# (proportions, via prop.table) per group.
############################################################
getWordSegmentLists <- function(doc.object, chunk.size = 10) {
  paras <- getNodeSet(doc.object, "/TEI/text/body//p")
  words <- paste(sapply(paras, xmlValue), collapse = " ")
  words.lower <- tolower(words)
  words.list <- strsplit(words.lower, "\\W")
  word.vector <- unlist(words.list)
  chunk.max <- length(word.vector) / chunk.size
  x <- seq_along(word.vector)
  chunks <- split(word.vector, ceiling(x / chunk.max))
  # note to self, fix this so it comes before chunking
  chunks <- lapply(chunks, removeBlanks)
  freq.chunks <- lapply(chunks, table)
  rel.freq.chunk.list <- lapply(freq.chunks, prop.table)
  return(rel.freq.chunk.list)
}

# Strip underscores from every token of x and drop the tokens
# that are empty afterwards.
removeBlanks <- function(x) {
  x <- gsub("_+", "", x)
  x[which(x != "")]
}

# Turn a list of tables (one per chunk) into a single data frame:
# each table becomes a data frame with an extra ID column holding
# the position of the chunk in chunk.list, and all are row-bound.
my.mapply <- function(chunk.list) {
  my.list <- mapply(data.frame, ID = seq_along(chunk.list), chunk.list,
                    SIMPLIFY = FALSE,
                    MoreArgs = list(stringsAsFactors = FALSE))
  my.df <- do.call(rbind, my.list)
  return(my.df)
}

#### Functions associated with POS Tagging/selection

# Split a phrase into its space-separated tokens.
SplitText <- function(Phrase) {
  unlist(strsplit(Phrase, " "))
}

# Keep only the tokens that match the regular expression tagID.
SelectTaggedWords <- function(Words, tagID) {
  Words[grep(tagID, Words)]
}

# Remove the first "/XX" or "/XXX" upper-case tag from each token.
RemoveTags <- function(Words) {
  sub("/[A-Z]{2,3}", "", Words)
}

# Remove every character that is not alphanumeric, whitespace
# or an apostrophe.
RemoveNonChar <- function(Words) {
  gsub("[^[:alnum:][:space:]']", "", Words)
}