DHWI: R Code Functions File

###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R. 
# Functions
###############################################################

#######################################################################
# A Function to print a vector of file names in user friendly format
#######################################################################

show.files<-function(file.name.vector){
  for(i in 1:length(file.name.vector)){
    cat(i, file.name.vector[i], "\n", sep=" ")
  }  
}

################################################################
# A Function takes a vector of file names and a directory path and 
# returns a list in which each item in the list is an ordered 
# vector of words from one of the files in the files vector
################################################################

make.file.word.list<-function(file.vector, inputDir){
  text.word.vector.list<-list()
  for(i in 1:length(file.vector)){
    # read in the file from dir
    text.lines<-scan(paste(inputDir, file.vector[i], sep="/"), what="character", sep="\n")
    # convert to a single string called text #
    text<-paste(text.lines, collapse=" ")
    text.lower<-tolower(text)
    text.words.list<-strsplit(text.lower, "\\W")
    text.word.vector<-unlist(text.words.list)
    # remove the blanks #
    text.word.vector<-text.word.vector[which(text.word.vector!="")]
    # use the index id from the file.vector vector as the "name" of the list
    text.word.vector.list[[file.vector[i]]]<-text.word.vector
    
  }
  return(text.word.vector.list)  
}

################################################################
# A Simple Function for creating a KWIC list
################################################################

doitKwic<-function(named.text.word.vector.list){
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid<- as.numeric(readline("Which file would you like to 
examine? Enter a file number: \n"))
  context<- as.numeric(readline("How much context do you want to 
see, Enter a number: \n"))
  keyword<- tolower((readline("Enter a keyword: \n")))
  hits<-which(named.text.word.vector.list[[fileid]] == keyword)
  if(length(hits)>0){
    for(h in 1:length(hits)){
      start<-hits[h]-context
      if(start < 1){ # may need this: if(start < 1 && h == 1){
        start<-1
      }
      end<-hits[h]+context
      cat(named.text.word.vector.list[[fileid]][start:end], 
          "\n")
    }
  }
}

################################################################
# A Nicer Function for creating a KWIC list
################################################################

doitKwicBetter<-function(named.text.word.vector.list){
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid<- as.numeric(readline("Which file would you like to examine? Enter a file number: \n"))
  context<- as.numeric(readline("How much context do you want to see, Enter a number: \n"))
  keyword<- tolower((readline("Enter a keyword: \n")))
  hits<-which(named.text.word.vector.list[[fileid]] == keyword)
  if(length(hits)>0){
    result<-NULL
    for(h in 1:length(hits)){
      start<-hits[h]-context
      if(start < 1){ #if(start < 1 && h == 1){
        start<-1
      }
      end<-hits[h]+context
      cat("\n-----------------------", h, "-------------------------\n")
      cat(named.text.word.vector.list[[fileid]][start:(hits[h]-1)], sep=" ")
      cat(" [", named.text.word.vector.list[[fileid]][hits[h]],"] ", sep="")
      cat(named.text.word.vector.list[[fileid]][(hits[h]+1):end], sep=" ")
      myrow<-cbind(hits[h], paste(named.text.word.vector.list[[fileid]][start:(hits[h]-1)], collapse=" "), paste(named.text.word.vector.list[[fileid]][hits[h]], collapse=" "), paste(named.text.word.vector.list[[fileid]][(hits[h]+1):end], collapse=" "))
      result<-rbind(result,myrow)
    }
    colnames(result)<-c("position", "left", "keyword", "right")
    toprint<-as.numeric((readline("Would you like to save this result to a file: enter 1=yes or 0=no \n")))
    if(toprint==1){
      write.csv(result, paste(keyword,"_In_", context, names(named.text.word.vector.list)[fileid], ".csv"))
    }
  } else {
    cat("YOUR KEYWORD WAS NOT FOUND\n")
  }
}

############################################################
# A Function to extract a table of relative frequencies
############################################################
getWordLists<-function(doc.object){
  paras<-getNodeSet(doc.object, "/TEI/text/body//p")
  words<-paste(sapply(paras, xmlValue), collapse=" ")
  words.lower<-tolower(words)
  words.list<-strsplit(words.lower, "\\W|_")
  word.vector<-unlist(words.list)
  book.freqs<-table(word.vector[which(word.vector!="")])
  book.freqs.rel<-100*(book.freqs/sum(book.freqs))
  return(book.freqs.rel)
}


getWordSegmentLists<-function(doc.object, chunk.size=10){
  paras<-getNodeSet(doc.object, "/TEI/text/body//p")
  words<-paste(sapply(paras,xmlValue), collapse=" ")
  words.lower <-tolower(words)
  words.list<-strsplit(words.lower, "\\W")
  word.vector<-unlist(words.list)
  chunk.max<-length(word.vector)/chunk.size
  x <- seq_along(word.vector)
  chunks <- split(word.vector, ceiling(x/chunk.max))
  chunks<-lapply(chunks, removeBlanks) # note to self, fix this so it comes before chunking
  freq.chunks<-lapply(chunks, table)
  rel.freq.chunk.list<-lapply(freq.chunks, prop.table)
  return(rel.freq.chunk.list)
}

removeBlanks<-function(x){
  x<-gsub("_+", "", x)
  x[which(x!="")]
}

my.mapply<-function(chunk.list){
  my.list<-mapply(data.frame, ID=seq_along(chunk.list), chunk.list, SIMPLIFY=FALSE, MoreArgs=list(stringsAsFactors=FALSE))
  my.df <- do.call(rbind, my.list)
  return(my.df)
}

####Functions associated with POS Tagging/selection
SplitText <- function(Phrase) {
  unlist(strsplit(Phrase," "))
}
SelectTaggedWords <- function(Words,tagID) {
  Words[ grep(tagID,Words) ]
}
RemoveTags <- function(Words) {
  sub("/[A-Z]{2,3}","",Words)
}

RemoveNonChar <- function(Words) {
  gsub("[^[:alnum:][:space:]']","",Words)
}

Leave a Reply