1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
############################################################### # mjockers unl edu # The Day in Code--DHWI Text Analysis with R. # Functions ############################################################### ####################################################################### # A Function to print a vector of file names in user friendly format ####################################################################### show.files<-function(file.name.vector){ for(i in 1:length(file.name.vector)){ cat(i, file.name.vector[i], "\n", sep=" ") } } ################################################################ # A Function takes a vector of file names and a directory path and # returns a list in which each item in the list is an ordered # vector of words from one of the files in the files vector ################################################################ make.file.word.list<-function(file.vector, inputDir){ text.word.vector.list<-list() for(i in 1:length(file.vector)){ # read in the file from dir text.lines<-scan(paste(inputDir, file.vector[i], sep="/"), what="character", sep="\n") # convert to a single string called text # text<-paste(text.lines, collapse=" ") text.lower<-tolower(text) text.words.list<-strsplit(text.lower, "\\W") text.word.vector<-unlist(text.words.list) # remove the blanks # text.word.vector<-text.word.vector[which(text.word.vector!="")] # use the index id from the file.vector vector as the "name" of the list text.word.vector.list[[file.vector[i]]]<-text.word.vector } return(text.word.vector.list) } ################################################################ # A Simple Function for creating a KWIC list ################################################################ doitKwic<-function(named.text.word.vector.list){ show.files(names(named.text.word.vector.list)) # ask the user for three bits of information fileid<- as.numeric(readline("Which file would you like to examine? Enter a file number: \n")) context<- as.numeric(readline("How much context do you want to see, Enter a number: \n")) keyword<- tolower((readline("Enter a keyword: \n"))) hits<-which(named.text.word.vector.list[[fileid]] == keyword) if(length(hits)>0){ for(h in 1:length(hits)){ start<-hits[h]-context if(start < 1){ # may need this: if(start < 1 && h == 1){ start<-1 } end<-hits[h]+context cat(named.text.word.vector.list[[fileid]][start:end], "\n") } } } ################################################################ # A Nicer Function for creating a KWIC list ################################################################ doitKwicBetter<-function(named.text.word.vector.list){ show.files(names(named.text.word.vector.list)) # ask the user for three bits of information fileid<- as.numeric(readline("Which file would you like to examine? Enter a file number: \n")) context<- as.numeric(readline("How much context do you want to see, Enter a number: \n")) keyword<- tolower((readline("Enter a keyword: \n"))) hits<-which(named.text.word.vector.list[[fileid]] == keyword) if(length(hits)>0){ result<-NULL for(h in 1:length(hits)){ start<-hits[h]-context if(start < 1){ #if(start < 1 && h == 1){ start<-1 } end<-hits[h]+context cat("\n-----------------------", h, "-------------------------\n") cat(named.text.word.vector.list[[fileid]][start:(hits[h]-1)], sep=" ") cat(" [", named.text.word.vector.list[[fileid]][hits[h]],"] ", sep="") cat(named.text.word.vector.list[[fileid]][(hits[h]+1):end], sep=" ") myrow<-cbind(hits[h], paste(named.text.word.vector.list[[fileid]][start:(hits[h]-1)], collapse=" "), paste(named.text.word.vector.list[[fileid]][hits[h]], collapse=" "), paste(named.text.word.vector.list[[fileid]][(hits[h]+1):end], collapse=" ")) result<-rbind(result,myrow) } colnames(result)<-c("position", "left", "keyword", "right") toprint<-as.numeric((readline("Would you like to save this result to a file: enter 1=yes or 0=no \n"))) if(toprint==1){ write.csv(result, paste(keyword,"_In_", context, names(named.text.word.vector.list)[fileid], ".csv")) } } else { cat("YOUR KEYWORD WAS NOT FOUND\n") } } ############################################################ # A Function to extract a table of relative frequencies ############################################################ getWordLists<-function(doc.object){ paras<-getNodeSet(doc.object, "/TEI/text/body//p") words<-paste(sapply(paras, xmlValue), collapse=" ") words.lower<-tolower(words) words.list<-strsplit(words.lower, "\\W|_") word.vector<-unlist(words.list) book.freqs<-table(word.vector[which(word.vector!="")]) book.freqs.rel<-100*(book.freqs/sum(book.freqs)) return(book.freqs.rel) } getWordSegmentLists<-function(doc.object, chunk.size=10){ paras<-getNodeSet(doc.object, "/TEI/text/body//p") words<-paste(sapply(paras,xmlValue), collapse=" ") words.lower <-tolower(words) words.list<-strsplit(words.lower, "\\W") word.vector<-unlist(words.list) chunk.max<-length(word.vector)/chunk.size x <- seq_along(word.vector) chunks <- split(word.vector, ceiling(x/chunk.max)) chunks<-lapply(chunks, removeBlanks) # note to self, fix this so it comes before chunking freq.chunks<-lapply(chunks, table) rel.freq.chunk.list<-lapply(freq.chunks, prop.table) return(rel.freq.chunk.list) } removeBlanks<-function(x){ x<-gsub("_+", "", x) x[which(x!="")] } my.mapply<-function(chunk.list){ my.list<-mapply(data.frame, ID=seq_along(chunk.list), chunk.list, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE)) my.df <- do.call(rbind, my.list) return(my.df) } ####Functions associated with POS Tagging/selection SplitText <- function(Phrase) { unlist(strsplit(Phrase," ")) } SelectTaggedWords <- function(Words,tagID) { Words[ grep(tagID,Words) ] } RemoveTags <- function(Words) { sub("/[A-Z]{2,3}","",Words) } RemoveNonChar <- function(Words) { gsub("[^[:alnum:][:space:]']","",Words) } |