###############################################################
# mjockers unl edu
# The Day in Code--DHWI Text Analysis with R.
# Functions
###############################################################
#######################################################################
# A Function to print a vector of file names in user friendly format
#######################################################################
# Print each element of a vector on its own line, prefixed by its
# 1-based position, e.g. "1 austen.txt ".
#
# Args:
#   file.name.vector: vector of file names to list.
# Returns:
#   NULL, invisibly; the function is called for its console output.
show.files<-function(file.name.vector){
  # seq_along() gives an empty sequence for an empty vector, whereas
  # 1:length(x) would iterate over c(1, 0) and print a bogus "1 NA" line.
  for(i in seq_along(file.name.vector)){
    cat(i, file.name.vector[i], "\n", sep=" ")
  }
  invisible(NULL)
}
################################################################
# A function that takes a vector of file names and a directory path and
# returns a list in which each item is an ordered vector of the words
# from one of the files in the file vector
################################################################
# Read each file and turn it into an ordered vector of lower-cased words.
#
# Args:
#   file.vector: character vector of file names (relative to inputDir).
#   inputDir:    directory containing the files.
# Returns:
#   A list with one element per file, named by the file name; each
#   element is the character vector of that file's words in reading
#   order. An empty file.vector yields an empty list.
make.file.word.list<-function(file.vector, inputDir){
  text.word.vector.list<-list()
  # seq_along() (not 1:length()) so that an empty file.vector performs no
  # iterations instead of trying to read a file called "NA".
  for(i in seq_along(file.vector)){
    # read in the file from dir, one element per non-blank line
    text.lines<-scan(paste(inputDir, file.vector[i], sep="/"), what="character", sep="\n")
    # collapse to a single lower-cased string
    text<-tolower(paste(text.lines, collapse=" "))
    # split on every non-word character; consecutive separators leave
    # empty strings behind, which are dropped on the next line
    text.word.vector<-unlist(strsplit(text, "\\W"))
    text.word.vector<-text.word.vector[text.word.vector!=""]
    # use the file name as the "name" of the list element
    text.word.vector.list[[file.vector[i]]]<-text.word.vector
  }
  text.word.vector.list
}
################################################################
# A Simple Function for creating a KWIC list
################################################################
# Interactive keyword-in-context (KWIC) listing.
#
# Lists the available texts, asks the user for a file number, a context
# width (number of words on each side) and a keyword, then prints one
# line per occurrence of the keyword with its surrounding words.
#
# Args:
#   named.text.word.vector.list: named list of word vectors, one per text
#     (the shape produced by make.file.word.list).
# Returns:
#   NULL, invisibly; the function is called for its console output.
doitKwic<-function(named.text.word.vector.list){
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid<- as.numeric(readline("Which file would you like to
examine? Enter a file number: \n"))
  context<- as.numeric(readline("How much context do you want to
see, Enter a number: \n"))
  keyword<- tolower((readline("Enter a keyword: \n")))
  words<-named.text.word.vector.list[[fileid]]
  hits<-which(words == keyword)
  # iterating over the hits directly does nothing when there are none
  for(hit in hits){
    # clamp the window at BOTH ends of the text; indexing past the last
    # word would otherwise print "NA" tokens
    start<-max(hit-context, 1)
    end<-min(hit+context, length(words))
    cat(words[start:end],
        "\n")
  }
  invisible(NULL)
}
################################################################
# A Nicer Function for creating a KWIC list
################################################################
# Interactive keyword-in-context (KWIC) listing with bracketed keywords
# and an optional CSV export.
#
# Lists the available texts, asks the user for a file number, a context
# width (words on each side) and a keyword, prints every occurrence as
# "left [keyword] right", and offers to save the hits as a CSV with the
# columns position, left, keyword and right.
#
# Args:
#   named.text.word.vector.list: named list of word vectors, one per text
#     (the shape produced by make.file.word.list).
# Returns:
#   NULL, invisibly; the function is called for its console output and
#   the optional file it writes.
doitKwicBetter<-function(named.text.word.vector.list){
  show.files(names(named.text.word.vector.list))
  # ask the user for three bits of information
  fileid<- as.numeric(readline("Which file would you like to examine? Enter a file number: \n"))
  context<- as.numeric(readline("How much context do you want to see, Enter a number: \n"))
  keyword<- tolower((readline("Enter a keyword: \n")))
  words<-named.text.word.vector.list[[fileid]]
  hits<-which(words == keyword)
  if(length(hits)>0){
    n.words<-length(words)
    # one row per hit, collected in a preallocated list and bound once at
    # the end instead of growing a matrix with rbind() inside the loop
    rows<-vector("list", length(hits))
    for(h in seq_along(hits)){
      hit<-hits[h]
      # clamp the window to the text at both ends
      start<-max(hit-context, 1)
      end<-min(hit+context, n.words)
      # a:b counts DOWN when a > b, so an empty side must be handled
      # explicitly: hit at the first word, hit at the last word, or a
      # context of 0 would otherwise pull in the wrong words or NAs
      left<-if(start < hit) words[start:(hit-1)] else character(0)
      right<-if(end > hit) words[(hit+1):end] else character(0)
      cat("\n-----------------------", h, "-------------------------\n")
      cat(left, sep=" ")
      cat(" [", words[hit],"] ", sep="")
      cat(right, sep=" ")
      rows[[h]]<-c(hit, paste(left, collapse=" "), words[hit], paste(right, collapse=" "))
    }
    result<-do.call(rbind, rows)
    colnames(result)<-c("position", "left", "keyword", "right")
    toprint<-as.numeric((readline("Would you like to save this result to a file: enter 1=yes or 0=no \n")))
    # isTRUE() treats a non-numeric answer (NA) as "no" instead of
    # failing with "missing value where TRUE/FALSE needed"
    if(isTRUE(toprint==1)){
      # NOTE(review): paste() joins with spaces, so the file name looks
      # like "word _In_ 5 file.txt .csv"; kept as-is because existing
      # workflows may rely on these names -- confirm whether paste0() was
      # intended.
      write.csv(result, paste(keyword,"_In_", context, names(named.text.word.vector.list)[fileid], ".csv"))
    }
  } else {
    cat("YOUR KEYWORD WAS NOT FOUND\n")
  }
  invisible(NULL)
}
############################################################
# A Function to extract a table of relative frequencies
############################################################
# Build a table of relative word frequencies (in percent) for the text
# held in the <p> nodes under /TEI/text/body of a parsed document.
#
# Args:
#   doc.object: parsed document accepted by getNodeSet().
# Returns:
#   A table whose names are the lower-cased words and whose values are
#   each word's share of all words, multiplied by 100.
getWordLists<-function(doc.object){
  para.nodes<-getNodeSet(doc.object, "/TEI/text/body//p")
  para.text<-sapply(para.nodes, xmlValue)
  # one long lower-cased string, split on non-word characters and "_"
  full.text<-tolower(paste(para.text, collapse=" "))
  tokens<-unlist(strsplit(full.text, "\\W|_"))
  # adjacent separators leave empty tokens; count only the real words
  counts<-table(tokens[tokens!=""])
  100*(counts/sum(counts))
}
# Split the text of a parsed document into consecutive segments and
# compute a table of relative word frequencies for each segment.
#
# Args:
#   doc.object: parsed document accepted by getNodeSet(); the text is
#     taken from the <p> nodes under /TEI/text/body.
#   chunk.size: despite its name, the NUMBER of segments the text is
#     divided into (default 10); each segment holds roughly
#     length(words)/chunk.size words.
# Returns:
#   A list with one element per segment, named by segment number; each
#   element is a table of within-segment proportions (summing to 1).
getWordSegmentLists<-function(doc.object, chunk.size=10){
  paras<-getNodeSet(doc.object, "/TEI/text/body//p")
  words<-paste(sapply(paras,xmlValue), collapse=" ")
  word.vector<-unlist(strsplit(tolower(words), "\\W"))
  # Clean the tokens BEFORE chunking: the split leaves empty strings, and
  # removing them afterwards made the segments hold unequal numbers of
  # real words.
  word.vector<-removeBlanks(word.vector)
  x<-as.numeric(seq_along(word.vector))
  # Multiply before dividing: x*chunk.size is exact, so the segment id of
  # the last word is exactly chunk.size, whereas x/(n/chunk.size) can
  # round just above a boundary and push a word into the next segment.
  chunk.id<-ceiling(x*chunk.size/length(word.vector))
  chunks<-split(word.vector, chunk.id)
  freq.chunks<-lapply(chunks, table)
  lapply(freq.chunks, prop.table)
}
# Strip underscores from every element of a vector and drop the elements
# that are empty (or missing) afterwards.
#
# Args:
#   x: vector of tokens.
# Returns:
#   The underscore-free, non-empty tokens in their original order.
removeBlanks<-function(x){
  stripped<-gsub("_+", "", x)
  keep<-!is.na(stripped) & stripped!=""
  stripped[keep]
}
# Stack the elements of a list into one data frame, tagging every row
# with the position of the list element it came from.
#
# Args:
#   chunk.list: list of objects that data.frame() can convert, e.g. the
#     per-segment frequency tables from getWordSegmentLists.
# Returns:
#   A data frame whose first column, ID, is the 1-based index of the
#   source element, followed by the columns data.frame() derives from it.
my.mapply<-function(chunk.list){
  ids<-seq_along(chunk.list)
  # Map() is mapply() without simplification; data.frame is still called
  # directly so the derived column names are unchanged
  per.chunk<-Map(data.frame, ID=ids, chunk.list, MoreArgs=list(stringsAsFactors=FALSE))
  do.call(rbind, per.chunk)
}
####Functions associated with POS Tagging/selection
# Break one or more strings into their space-separated pieces.
#
# Args:
#   Phrase: character vector of strings to split.
# Returns:
#   A single character vector holding all pieces in order.
SplitText <- function(Phrase) {
  pieces <- strsplit(Phrase, " ")
  unlist(pieces)
}
# Keep only the elements of Words that match the pattern tagID.
#
# Args:
#   Words: vector of tokens, e.g. "dog/NN".
#   tagID: regular expression identifying the wanted tag, e.g. "/NN".
# Returns:
#   The matching elements of Words, in their original order.
SelectTaggedWords <- function(Words,tagID) {
  has_tag <- grepl(tagID, Words)
  Words[has_tag]
}
# Strip the part-of-speech tag from tokens of the form "word/TAG".
#
# Args:
#   Words: character vector of tagged tokens.
# Returns:
#   Words with the first "/TAG" occurrence removed from each element;
#   elements without a tag are returned unchanged.
RemoveTags <- function(Words) {
  # Tags are 2-4 capital letters with an optional trailing "$" (e.g. NN,
  # VBZ, NNPS, PRP$, WP$). Matching only 2-3 letters left residue behind:
  # "Americans/NNPS" became "AmericansS" and "his/PRP$" became "his$".
  sub("/[A-Z]{2,4}[$]?","",Words)
}
# Delete every character that is not a letter, a digit, whitespace or an
# apostrophe.
#
# Args:
#   Words: character vector to clean.
# Returns:
#   Words with all other characters removed from each element.
RemoveNonChar <- function(Words) {
  unwanted <- "[^[:alnum:][:space:]']"
  gsub(unwanted, "", Words)
}