##########################################################################################################
# THREADNET: Core functions
# This software may be used according to the terms provided in the
# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
# Absolutely no warranty!
##########################################################################################################

# These are the basic functions that convert threads to networks, etc.

#' Converts threads to network
#'
#' Converts a sequentially ordered stream of events (threads) and creates a unimodal, unidimensional network.
#' Sequentially adjacent pairs of events become edges in the resulting network.
#' @family ThreadNet_Core
#'
#' @param et dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param timesplit name of the column used as the time measure for node x-positions
#'
#' @return a list containing two dataframes, one for the nodes (nodeDF) and one for the edges (edgeDF)
#'
#' @export
threads_to_network <- function(et, TN, CF, timesplit){

  # use the requested time measure as the working time column
  et$time <- et[[timesplit]]

  # First get the node names & remove the spaces
  node_label <- unique(et[[CF]])
  node_label <- str_replace_all(node_label, " ", "_")

  # set up the data frames we need to draw the network
  nodes <- data.frame(
    id = seq_along(node_label),
    label = node_label,
    title = node_label)

  # y position = frequency of each event code; x position = mean time of occurrence
  node_position_y <- data.frame(table(et[[CF]]))
  colnames(node_position_y) <- c('label', 'y_pos')
  node_position_x <- aggregate(et$time, list(et[[CF]]), mean)
  colnames(node_position_x) <- c('label', 'x_pos')

  nodes <- merge(nodes, node_position_y, by = c("label"))
  nodes <- merge(nodes, node_position_x, by = c("label"))

  # get the 2-grams for the edges
  ngdf <- count_ngrams(et, TN, CF, 2)

  # str_split with n = 2 already yields c(from_label, to_label) per 2-gram,
  # so no second split is needed (the old code re-split each element and
  # relied on accidental list promotion during subassignment)
  from_to_str <- str_split(str_trim(ngdf$ngrams), " ", n = 2)

  nEdges <- length(from_to_str)
  from <- integer(nEdges)
  to <- integer(nEdges)
  for (i in seq_len(nEdges)){
    # use match to look up the node id from the label
    from[i] <- match(from_to_str[[i]][1], nodes$label)
    to[i] <- match(from_to_str[[i]][2], nodes$label)
  }

  edges <- data.frame(
    from,
    to,
    label = paste(ngdf$freq))

  # attach endpoint coordinates: first merge adds the from-node position,
  # second merge adds the to-node position
  edges <- merge(edges, nodes[, c('id', 'y_pos', 'x_pos')], by.x = c('from'), by.y = c('id'))
  edges <- merge(edges, nodes[, c('id', 'y_pos', 'x_pos')], by.x = c('to'), by.y = c('id'))

  # BUGFIX: merge() puts its by.x column first, so after the second merge the
  # column order is (to, from, label, from_y, from_x, to_y, to_x). The old
  # positional rename mislabeled the first two columns, swapping edge direction.
  colnames(edges) <- c('to', 'from', 'label', 'from_y', 'from_x', 'to_y', 'to_x')
  edges <- edges[, c('from', 'to', 'label', 'from_y', 'from_x', 'to_y', 'to_x')]

  return(list(nodeDF = nodes, edgeDF = edges))
}

# here is a version without all the position stuff, which should be separated out, if possible.
# Added in the "group" for the network graphics - default group is 'threadNum' because it will always be there
#
# Same node/edge construction as threads_to_network, but without the position
# columns, with a Group column on the nodes, normalized edge weights, and
# self-loops removed.
#
# et  = dataframe containing threads
# TN  = name of the thread-number column
# CF  = name of the column whose values become nodes
# grp = name of the column used to group nodes (default 'threadNum')
threads_to_network_original <- function(et, TN, CF, grp='threadNum'){

  # First get the node names & remove the spaces
  node_label <- levels(factor(et[[CF]]))
  node_label <- str_replace_all(node_label, " ", "_")
  nNodes <- length(node_label)

  # NOTE(review): node_label has spaces replaced by "_" but is matched verbatim
  # against et[[CF]] below; a label that originally contained a space would not
  # match -- confirm CF values never contain spaces.
  node_group <- character()
  for (n in seq_len(nNodes)){
    # take the group value of the first occurrence of each node label
    node_group <- c(node_group, as.character(unlist(et[which(et[[CF]] == node_label[n]), grp][1])))
  }

  # set up the data frames we need to draw the network
  nodes <- data.frame(
    id = seq_len(nNodes),
    label = node_label,
    Group = node_group,
    title = node_label)

  # get the 2 grams for the edges
  ngdf <- count_ngrams(et, TN, CF, 2)

  # Adjust the frequency of the edges to 0-1 range
  ngdf$freq <- round(ngdf$freq / max(ngdf$freq), 3)

  # str_split with n = 2 already yields c(from_label, to_label) per 2-gram,
  # so no second split is needed (the old code re-split each element and
  # relied on accidental list promotion during subassignment)
  from_to_str <- str_split(str_trim(ngdf$ngrams), " ", n = 2)

  nEdges <- length(from_to_str)
  from <- integer(nEdges)
  to <- integer(nEdges)
  for (i in seq_len(nEdges)){
    # use match to look up the node id from the label
    from[i] <- match(from_to_str[[i]][1], nodes$label)
    to[i] <- match(from_to_str[[i]][2], nodes$label)
  }

  # build the edge list and drop self-loops
  edges <- data.frame(
    from,
    to,
    label = ngdf$freq,
    Value = ngdf$freq) %>%
    filter(!from == to)

  return(list(nodeDF = nodes, edgeDF = edges))
}

# Counting ngrams is essential to several ThreadNet functions

#' Counts ngrams in a set of threads
#'
#' This function counts n-grams within threads where the length of the thread is greater than n.
#' @family ThreadNet_Core
#'
#' @param o dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param n length of ngrams to count
#'
#' @return a dataframe with ngram, frequency and proportion in descending order
#'
#' @export
count_ngrams <- function(o,TN,CF,n){

  # Need a vector of strings, one for each thread, delimited by spaces.
  # long_enough() filters out the threads that are shorter than n
  # (use space for the delimiter here).
  text_vector = long_enough( thread_text_vector(o,TN,CF,' '), n, ' ')

  # get.phrasetable() returns the ngrams with freq and prop columns
  ng = get.phrasetable(ngram(text_vector,n))

  # add a column here for the length of the ngram -- useful later!
  ng$len = n

  return(ng)
}

#################################################################
#
#' Make new threads from a new POV
#'
#' Take the raw occurrences from the input file and sort them by time stamp within
#' a set of contextual factors that remain constant for each thread.
#' @family ThreadNet_Core
#'
#' @param o is the dataframe of cleaned ocurrences
#' @param THREAD_CF is a list of 1 or more context factors that define the threads (and stay constant during each thread)
#' @param EVENT_CF is a list of 1 or more context factors that define events (and change during threads)
#'
#' @return dataframe containing the same occurrences sorted from a different point of view
#'
#'@export
ThreadOccByPOV <- function(o,THREAD_CF,EVENT_CF){

  withProgress(message = "Creating Events", value = 0,{

    # number of progress increments for the shiny progress bar
    n = 5

    # make sure there is a value; bail out with an empty frame otherwise
    if (length(THREAD_CF) == 0 | length(EVENT_CF)==0){return(data.frame())}

    incProgress(1/n)

    # Sort by POV and timestamp. The idea is to get the stream of activities from
    # a particular point of view (e.g., actor, location, etc.)

    # add the new column that combines CFs, if necessary
    # get a new column name based on the thread_CF -- use this to define threads
    nPOV = newColName(THREAD_CF)
    occ = combineContextFactors(o,THREAD_CF, nPOV )

    # The event context factors define the new category of events within those threads
    occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))

    # order by POV (thread) first, then by timestamp within each thread
    occ = occ[order(occ[nPOV],occ$tStamp),]

    # add two columns to the data frame
    occ$threadNum = integer(nrow(occ))
    occ$seqNum = integer(nrow(occ))

    # add new column called label - just copy the new combined event_CF column
    occ$label = occ[[newColName(EVENT_CF)]]

    # occurrences have zero duration
    occ$eventDuration = 0

    # Also add columns for the time gaps that appear from this POV
    occ$timeGap = diff_tStamp(occ$tStamp)

    # create new column for relative time stamp. Initialize to absolute tStamp and adjust below
    occ$relativeTime = lubridate::ymd_hms(occ$tStamp)

    # then get the unique values in that POV
    occ[nPOV] = as.factor(occ[,nPOV])
    pov_list = levels(occ[[nPOV]])

    incProgress(2/n)

    # now loop through the pov_list and assign values to the new columns.
    # Relies on occ being sorted above so each POV's rows are contiguous.
    start_row=1
    thrd=1
    for (p in pov_list){

      # get the length of the thread
      tlen = sum(occ[[nPOV]]==p)

      # guard against error (a level with no rows)
      if (tlen>0){

        # compute the index of the end row
        end_row = start_row+tlen-1

        # they all get the same thread number and incrementing seqNum
        occ[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen))
        occ[start_row:end_row, "seqNum"] <- as.matrix(c(1:tlen))

        # find the earliest time value for this thread
        start_time = min(lubridate::ymd_hms(occ$tStamp[start_row:end_row]))

        # increment the counters for the next thread
        start_row = end_row + 1
        thrd=thrd+1
      } # tlen>0
    }

    incProgress(3/n)

    # split occ data frame by threadNum to find earliest time value for that thread
    # then subtract that from the initialized relativeTime from above
    occ_split = lapply(split(occ, occ$threadNum), function(x) {x$relativeTime = x$relativeTime - min(lubridate::ymd_hms(x$tStamp)); x})

    # row bind data frame back together
    occ= data.frame(do.call(rbind, occ_split))

    # these are just equal to the row numbers -- one occurrence per event
    occ["occurrences"] = 1:nrow(occ)

    # now go through and change each of the CF values to a vector (0,0,0,1,0,0,0,0)
    for (cf in EVENT_CF){

      # make a new column for each CF
      VCF = paste0("V_",cf)
      occ[[VCF]]= vector(mode = "integer",length=nrow(occ))
      for (r in 1:nrow(occ)){
        occ[[r,VCF]] = list(convert_CF_to_vector(occ,cf,r))
      }
    }

    incProgress(4/n)

    # this will store the event map in the GlobalEventMappings and return events with network cluster added for zooming...
    e=clusterEvents(occ, 'OneToOne', 'Network Proximity', EVENT_CF,'threads')
  })

  # for debugging, this is really handy
  # save(occ,e,file="O_and_E_1.rdata")

  print('done converting occurrences...')

  # reveal the downstream UI tabs now that events exist
  shinyjs::show(selector = "#navbar li a[data-value=visualize]")
  shinyjs::show(selector = "#navbar li a[data-value=subsets]")
  shinyjs::show(selector = "#navbar li a[data-value=comparisons]")
  shinyjs::show(selector = "#navbar li a[data-value=movingWindow]")
  shinyjs::show(selector = "#navbar li a[data-value=parameterSettings]")

  # NOTE(review): n and e are assigned inside the withProgress() expression and
  # used here afterwards -- this works because the expression is evaluated in
  # the calling environment, but it is fragile; confirm before refactoring.
  incProgress(5/n)

  return( e )
}

##############################################################################################################

#' Maps occurrences into events
#'
#' This function provides a place to map occurrences into events, so it is not necessary to interpret individual
#' occurrences in isolation. There are many ways to accomplish this mapping.
#' @family ThreadNet_Core
#'
#' @param o a dataframe of occurrences
#' @param m = method parameter = one of c("Changes", "Time Gap", "Fixed Size")
#' @param EventMapName = used to store this mapping for visualization and comparison
#' @param uniform_chunk_size = used to identify breakpoints -- from input slider
#' @param tThreshold = used to identify breakpoints -- from input slider
#' @param timescale hours, min or sec
#' @param chunk_CF - context factors used to delineate chunks
#' @param event_CF - context factors used to define events
#' @param compare_CF = context factors used for comparison -- need to be copied over here when the thread is created.
#'
#' @return event data frame, with occurrences aggregated into events.
#'
#' @export
# new version containing more ways to create chunks -- uses concepts from original prototype, but better implementation
# chunk by handoff, time gap and handoff gap
OccToEvents_By_Chunk <- function(o, m, EventMapName, uniform_chunk_size, tThreshold, timescale='mins', chunk_CF, event_CF, compare_CF){

  # Only run if eventMapName is filled in
  if (EventMapName =="") {return(data.frame()) }

  # put this here for now -- overrides the timescale argument
  timescale='mins'

  #### First get the break points between the events
  # Ideally, these should operate WITHIN each thread, not on the whole set of occurrences...
  # choices = c( "Changes", "Time Gap","Fixed Size")
  if (m=="Changes"){
    # NOTE(review): breakpoints where the handoff gap is ZERO -- confirm this is
    # the intended polarity for diff_handoffs().
    o$handoffGap = diff_handoffs(o[chunk_CF])
    breakpoints = which(o$handoffGap == 0)
  } else if (m=="Time Gap") {
    breakpoints = which(o$timeGap > tThreshold)
  } else if (m=="Fixed Size") {
    breakpoints = seq(1,nrow(o),uniform_chunk_size)
  }

  # Grab the breakpoints from the beginning of the threads as well,
  # so chunks never span a thread boundary
  threadbreaks = which(o$seqNum == 1)
  breakpoints = sort(union(threadbreaks,breakpoints))

  ### Use the break points to find the chunks -- just store the index back to the raw data
  nChunks = length(breakpoints)

  # make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
  e = make_event_df(event_CF, compare_CF, nChunks)

  # counters for assigning thread and sequence numbers
  thisThread=1
  lastThread=0
  seqNo=0

  for (chunkNo in 1:nChunks){

    # Chunks start at the current breakpoint
    start_idx=breakpoints[chunkNo]

    # Chunks end at the next breakpoint, minus one;
    # for the last chunk, the stop_idx is the last row
    if (chunkNo < nChunks){
      stop_idx = breakpoints[chunkNo+1] - 1
    } else if (chunkNo==nChunks){
      stop_idx = nrow(o)
    }

    # assign the occurrences and build the composite label
    e$occurrences[[chunkNo]] = list(start_idx:stop_idx)
    e$label[[chunkNo]] = paste0('<', str_replace_all(concatenate(o$label[start_idx:stop_idx]), ' ','++'), '>')

    # assign timestamp and duration (start of first occurrence; span to last)
    e$tStamp[chunkNo] = o$tStamp[start_idx]
    e$eventDuration[chunkNo] = difftime(o$tStamp[stop_idx], o$tStamp[start_idx],units=timescale )

    # copy in the threadNum and remember it for the sequence counter below
    e$threadNum[chunkNo] = o$threadNum[start_idx]
    thisThread = o$threadNum[start_idx]

    # fill in data for each of the comparison context factors
    for (cf in compare_CF){
      e[chunkNo,cf] = as.character(o[start_idx,cf])
    }

    # aggregate the CF vectors over the occurrences of this chunk
    for (cf in event_CF){
      VCF = paste0("V_",cf)
      e[[chunkNo, VCF]] = list(aggregate_VCF_for_event(o,e$occurrences[chunkNo],cf ))
    }

    # Advance or reset the seq counters when the thread changes
    if (thisThread == lastThread){
      seqNo = seqNo +1
    } else if (thisThread != lastThread){
      lastThread = thisThread
      seqNo = 1
    }
    e$seqNum[chunkNo] = seqNo
  }

  # convert the comparison CFs to factors
  for (cf in compare_CF){
    e[cf] = as.factor(e[,cf])
  }

  # fill in the last column with the label (tried using row number...)
  e$ZM_1 = as.factor(e$label)

  # this will store the event map in the GlobalEventMappings and return events with network cluster added for zooming...
  e=clusterEvents(e, EventMapName, 'Contextual Similarity', event_CF,'threads')

  # for debugging, this is really handy
  # save(o,e,file="O_and_E_2.rdata")

  return(e)
}

# this one creates events based on frequent ngrams or regular expressions
#
# o = dataframe of occurrences; EventMapName = name under which to store the mapping
# EVENT_CF / compare_CF = context factors defining events / comparisons
# TN / CF = thread-number and event-code column names
# rx = dataframe of patterns ($pattern, $label); KeepIrregularEvents = 'Keep' or 'Drop'
OccToEvents3 <- function(o, EventMapName,EVENT_CF, compare_CF,TN, CF, rx, KeepIrregularEvents){

  # Only run if eventMapName is filled in
  if (EventMapName =="") {return(data.frame()) }

  # keep track of the length (in tokens) of each pattern
  for (i in 1:nrow(rx)) {rx$patLength[i] = length(unlist(strsplit(rx$pattern[i], ','))) }

  # put this here for now
  timescale='mins'

  # Only run if eventMapName is filled in; return empty data frame otherwise
  # (redundant with the guard above, kept as-is)
  if (EventMapName ==""){return(data.frame())}

  # get the text vector for this set of threaded occurrences, delimited by commas
  tv = thread_text_vector( o, TN, CF, ',' )

  # apply regex to tv and split on the commas
  tvrx = replace_regex_list( tv, rx )
  tvrxs = lapply(1:length(tvrx), function(i) {unlist(strsplit(tvrx[[i]],','))})

  # count the total number of chunks
  nChunks = length(unlist(tvrxs))

  # make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
  e = make_event_df(EVENT_CF, compare_CF, nChunks)

  # loop through the threads and fill in the data for each event:
  # when it's a row number in the input data array, just copy the row;
  # when it's one of the regex labels, use the numbers in the pattern to compute V_ for the new event
  chunkNo=0
  original_row=0
  for (thread in 1:length(tvrxs)){

    # the events stay in sequence
    for (sn in 1:length(tvrxs[[thread]])){

      # increment the current row numbers
      chunkNo = chunkNo+1
      original_row=original_row+1

      # assign the thread and sequence number
      e$threadNum[chunkNo] = thread
      e$seqNum[chunkNo] = sn

      # copy the comparison CFs from the input row
      for (cf in compare_CF){
        e[[chunkNo, cf]] = o[[original_row, cf]]
      }

      # this is a chunk that matched one of the patterns
      if (tvrxs[[thread]][sn] %in% rx$label ){

        # Use the ZM_1 column to store the new labels
        e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
        e$label[chunkNo] = tvrxs[[thread]][sn]

        # the matched pattern consumed rxLen input rows; record them and skip ahead
        rxLen = rx$patLength[which( rx$label==tvrxs[[thread]][sn])]
        e$occurrences[[chunkNo]] = list(seq(original_row,original_row+rxLen-1,1))
        original_row = original_row + rxLen-1

        # compute the V_ based on the occurrences
        for (cf in EVENT_CF){
          VCF = paste0("V_",cf)
          e[[chunkNo, VCF]] = list(aggregate_VCF_for_regex(o,e$occurrences[chunkNo],cf ))
        }

        # assign timestamp -- NOTE(review): taken from the LAST consumed row
        # (original_row was advanced above); duration is left unset here.
        e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]

      } else if (KeepIrregularEvents=='Keep') {

        # no pattern matched: copy data from the single input row
        # Use the ZM_1 column to store the new labels
        e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
        e$label[chunkNo] = tvrxs[[thread]][sn]
        e$occurrences[[chunkNo]] = original_row

        # copy the rest of the data
        for (cf in EVENT_CF){
          VCF = paste0("V_",cf)
          e[[chunkNo, VCF]] = o[[original_row, VCF]]
        }
        e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]
        e[[chunkNo,'eventDuration']] = o[[original_row,'eventDuration']]
      }
    } # sn loop
  } # thread loop

  # take out the irregular events (empty rows) if so desired
  if (KeepIrregularEvents=='Drop'){
    # keep the subset where the event is not blank
    e=subset(e, !ZM_1=='')
  }

  # for debugging, this is really handy
  # save(o,e,rx,tvrxs, file="O_and_E.rdata")

  # store the event map in the GlobalEventMappings and return the eventmap
  eventMap = store_event_mapping(EventMapName, e)
  return(eventMap[['threads']])
}

# new function for new tab
# e is the event list
# NewMapName is an input selected from the list of available mappings
# cluster_method is either "Sequential similarity" or "Contextual Similarity" or "Network Proximity"
# what_to_return = 'cluster' (the hclust object) or anything else (the stored threads)
clusterEvents <- function(e, NewMapName, cluster_method, event_CF,what_to_return='cluster'){

  # make sure to cluster on the correct column (one that exists...)
  if (cluster_method=="Sequential similarity") {
    dd = dist_matrix_seq(e)
  } else if (cluster_method=="Contextual Similarity") {
    dd = dist_matrix_context(e,event_CF)
  } else if (cluster_method=="Network Proximity") {
    # The focal column is used to trace the network. It will probably only be present in the OneToOne mapping, but we should check more generally.
    # If it's not present, then use the highest granularity of zooming available.
    focalCol =newColName(event_CF)
    if (! focalCol %in% colnames(e)) {focalCol = paste0('ZM_',zoom_upper_limit(e))}
    dd = dist_matrix_network(e,focalCol)
  }

  ### cluster the elements
  clust = hclust( dd, method="ward.D2" )

  ######## need to delete the old ZM_ columns and append the new ones. ###########
  e[grep("ZM_",colnames(e))]<-NULL

  # number of chunks is the number of rows in the distance matrix
  nChunks = attr(dd,'Size')

  # make new data frame with column names for each cluster level
  zm = setNames(data.frame(matrix(ncol = nChunks, nrow = nChunks)), paste0("ZM_", 1:nChunks))

  ## Create a new column for each cluster solution
  for (cluster_level in 1:nChunks){
    clevelName = paste0("ZM_",cluster_level)
    zm[clevelName] = cutree(clust, k=cluster_level)
  } # for cluster_level

  # append this onto the events to allow zooming;
  # need to handle differently for network clusters.
  # We are relying on "unique" returning values in the same order whenever it is called on the same data.
  if (cluster_method=="Network Proximity") {
    merge_col_name = newColName(event_CF)
    zm[[merge_col_name]]=unique(e[[merge_col_name]])
    newmap = merge(e, zm, by=merge_col_name)
  } else {newmap=cbind(e, zm)}

  # only store the event map in the GlobalEventMappings if something is filled in
  if (!NewMapName=="") { eventMap = store_event_mapping( NewMapName, newmap ) }

  # return the cluster solution for display, or the stored threads
  # NOTE(review): if NewMapName=="" and what_to_return!='cluster', eventMap is
  # never assigned and the return errors -- confirm callers never hit this combination.
  if (what_to_return=='cluster') {return(clust)} else {return(eventMap[['threads']])}
}

# this function computes the similarity of chunks based on sequence
dist_matrix_seq <- function(e){
  nChunks = nrow(e)
  evector=vector(mode="list", length = nChunks)
  for (i in 1:nChunks){
    evector[i]=unique(as.integer(unlist(e$occurrences[[i]])))
  }
  # osa = optimal string alignment distance over the occurrence sequences
  return( stringdistmatrix( evector, method="osa") )
}

# this function computes the similarity of chunks based on context
# e = events, with V_ columns
# CF = event CFs
dist_matrix_context <- function( e, CF ){
  nChunks = nrow(e)
  evector= VCF_matrix( e, paste0( "V_",CF ))
  return( dist( evector, method="euclidean") )
}

# this function computes the similarity of chunks based on network path length
dist_matrix_network <- function(e,CF){

  # first get the nodes and edges
  n=threads_to_network_original(e,'threadNum',CF)

  # now get the shortest paths between all nodes in the graph
  d=distances(graph_from_data_frame(n$edgeDF), v=n$nodeDF[['label']], to=n$nodeDF[['label']])

  return( as.dist(d) )
}

# adjacency matrix for an edge list
net_adj_matrix <- function(edges){
  return(as_adj(graph_from_edgelist(as.matrix(edges))))
}

# new data structure for events (BTP 3/28)
# event_CF / compare_CF = context factor names; N = number of event rows
make_event_df <- function(event_CF,compare_CF,N){

  # Make a data frame with columns for each CF, and put one vector into each column
  e = data.frame(
    tStamp = numeric(N),          # this is the event start time
    eventDuration = numeric(N),
    label = character(N),
    occurrences = integer(N),
    threadNum = integer(N),
    seqNum = integer(N))

  # add columns for each of the context factors used to define events;
  # first make the dataframes for each
  cf1v=setNames(data.frame(matrix(ncol = length(event_CF), nrow = N)), paste0("V_",event_CF))
  cf2=setNames(data.frame(matrix(ncol = length(compare_CF), nrow = N)), compare_CF)

  # Then combine them
  e = cbind(e, cf2,cf1v)

  # and add one more column for the event code/description -- maybe use label instead of this?
  e$ZM_1 = character(N)
  return(e)
}

# this will convert the context factor into a one-hot list (like this: 0 0 0 0 0 0 0 0 1 0 0)
# o is the dataframe of occurrences
# CF is the context factor (column) -- must be a factor so levels() works
# r is the row (occurrence number in the One-to-One mapping)
convert_CF_to_vector <- function(o,CF,r){
  return(as.integer((levels(o[[CF]]) ==o[[r,CF]])*1))
}

# Aggregate the VCF (CF vector) for that CF
# There are two layers to this.
# 1) aggregate_VCF_for_event
#    Within a single event, aggregate the VCF for the occurrences that make up that event.
#    This function will only get used when creating from the functions that convert occurrences to events
# 2) aggregate_VCF_for_cluster
#    For a cluster level, aggregate the events at that cluster level (e.g., ZM_n)
#    This function will work on any event, even the one_to_one mapping.
#
# o is a dataframe of occurrences. The values of V_ (the VCF) do not have to be filled in. They get re-computed here for each occurrence.
# occList is the list of occurrences of that event (e$occurrences)
# cf is the name of the contextual factor to create the VCF
aggregate_VCF_for_event <- function(o, occList, cf){

  # get the column name for the VCF
  VCF = paste0("V_",cf)

  # start with the first one so the dimension of the vector is correct
  aggCF = convert_CF_to_vector(o, cf, unlist(occList)[1])

  # now add the rest, if there are any
  if (length(unlist(occList)) > 1){
    for (idx in seq(2,length(unlist(occList)),1)){
      aggCF = aggCF + convert_CF_to_vector(o, cf, unlist(occList)[idx])
    }}
  return(aggCF)
}

# this version assumes that the VCF is already computed.
# Might come in handy, but it's not correct...
# Sum the precomputed V_<cf> vectors over the occurrences in occList.
# o = occurrences with V_ columns already filled in; cf = context factor name.
aggregate_VCF_for_regex <- function(o, occList, cf){

  # get the column name for the VCF
  VCF = paste0("V_",cf)

  # start with the first one so the dimension of the vector is correct
  aggCF = unlist(o[unlist(occList)[1],VCF])

  # now add the rest, if there are any
  if (length(unlist(occList)) > 1){
    for (idx in seq(2,length(unlist(occList)),1)){
      aggCF = aggCF+unlist(o[[unlist(occList)[idx],VCF]])
    }}
  return(aggCF)
}

# Same basic idea, but works on a set of events within a cluster, rather than a set of occurrences within an event
# so you get a subset of rows, convert to a matrix and add them up
# e holds the events
# cf holds a single contextual factor, so you need to call this in a loop
# zoom_col and eclust are used to subset the data. They could actually be anything.
aggregate_VCF_for_cluster <- function(e, cf, eclust, zoom_col){

  # get the column name for the VCF
  VCF = paste0("V_",cf)

  # get the subset of events for that cluster -- just the VCF column.
  # This version compares on the integer codes of the zoom column:
  # s = e[ which(e[[zoom_col]]==eclust), VCF] This version uses the
  s = e[ which(as.integer(e[[zoom_col]])==eclust), VCF]

  # empty subset -> nothing to aggregate; otherwise sum the rows
  if ( is.null(unlist(s) )) return(NULL) else return( colSums( matrix( unlist(s), nrow = length(s), byrow = TRUE) ))
}

# this one takes the whole list of VCF column names and binds their matrices side by side
VCF_matrix <- function(e, vcf ){
  m = one_vcf_matrix(e, vcf[1] )
  if (length(vcf)>1){
    for (idx in seq(2,length(vcf),1)){
      m = cbind( m, one_vcf_matrix(e, vcf[idx] ) )
    }}
  return(m)
}

# this one takes a single column as an argument: one row per event, unpacked from the list column
one_vcf_matrix <- function(e, vcf){
  return( matrix( unlist( e[[vcf]] ), nrow = length( e[[vcf]] ), byrow = TRUE) )
}

# this is used for the regex pages to show the threads.
# similar code is used in count_ngrams and to make networks, but with different delimiters
# and with a minimum sequence length (ngram size), but this can be filtered after this function
#
# Build one delimited string of event codes per thread.
#
# o = dataframe of occurrences/events
# TN = name of the thread-number column
# CF = name of the column whose values are concatenated
# delimiter = separator placed between successive codes
#
# Returns a character vector with one element per unique thread.
thread_text_vector <- function(o, TN, CF, delimiter){

  # one string per unique thread, preallocated
  # (removed a dead local that recomputed the subset on every iteration)
  thread_ids = unique(o[[TN]])
  tv = character(length(thread_ids))

  for (j in seq_along(thread_ids)){
    # concatenate() space-separates the codes; swap in the requested delimiter
    tv[j] = str_replace_all(concatenate(o[o[[TN]]==thread_ids[j],CF] ),' ',delimiter)
  }
  return(tv)
}

# use this to replace patterns for regex and ngrams
# tv is the text vector for the set of threads
# rx is the dataframe for regexpressions ($pattern, $label)
replace_regex_list <- function(tv, rx ){
  for (i in seq_along(tv)) {
    for (j in seq_len(nrow(rx)) ) {
      tv[i] = str_replace_all(tv[i],rx$pattern[j],rx$label[j])
    }
  }
  return(tv)
}

# same function, but with lapply -- but does not work.
# replace_regex_list_lapply <- function(tv, rx){
#
#  lapply(1:length(tv), function(i){
#    lapply(1:nrow(rx),function(j){
#      str_replace_all(tv[i], rx$pattern[j], rx$label[j] ) }
#    ) })
# }

# No longer needed?
# selectize_frequent_ngrams<- function(e, TN, CF, minN, maxN, threshold){
#
#   f=str_replace_all(trimws(frequent_ngrams(e, TN, CF, minN, maxN, threshold,TRUE)[,'ngrams'], which=c('right')), ' ',',')
#  return(f)
# }

# combined set of frequent ngrams over sizes minN..maxN
# add parameter to make maximal a choice
frequent_ngrams <- function(e, TN, CF, minN, maxN, onlyMaximal=TRUE){

  # initialize the output with the smallest size, then append the larger sizes
  ng = count_ngrams(e,TN, CF,minN)
  if (maxN > minN){
    for (i in seq(minN+1,maxN,1)){
      ng = rbind(ng,count_ngrams(e,TN, CF,i))
    }
  }

  # remove the rows that happen once and only keep the columns we want
  ng=ng[ng$freq>1,c('ngrams','freq', 'len')]

  # just take the maximal ones if so desired
  if (onlyMaximal) { ng=maximal_ngrams(ng) }

  # return the set sorted by most frequent
  return(ng[order(-ng$freq),])
}

# this filters out ngrams that are contained within others ('2 2' is part of '2 2 2')
maximal_ngrams <- function(ng){

  # find out if each ngram is contained in all the others.
  # NOTE(review): grep treats the ngram text as a regex; tokens containing
  # regex metacharacters would misbehave -- confirm tokens are plain.
  w = lapply(1:nrow(ng), function(i){ grep(ng$ngrams[i],ng$ngrams)} )

  # get howMany times each one appears
  howMany = lapply(1:length(w), function(i){ length(w[[i]])} )

  # return the ones that only match themselves
  return(ng[which(howMany==1),])
}

# compute support level for each ngram
# tv = text vectors for the threads
# ng = frequent ngrams data frame
# returns ng data frame with support level (fraction of threads containing the ngram) added
support_level <- function(tv, ng) {

  # change the commas back to spaces
  tv=str_replace_all(tv, ',' , ' ')
  totalN = length(tv)

  # need to remove whitespace from the trailing edge of the ngrams
  ng$ngrams = trimws(ng$ngrams)

  # find out how many times each ngram is contained in each tv entry
  ng$support = unlist(lapply(1:nrow(ng), function(i){ length(grep(ng$ngrams[i],tv)) }) )/totalN

  # toss in the generativity level
  ng = generativity_level(tv,ng)
  return(ng)
}

# compute the generativity = in-degree and out-degree
# For each ngram, look at the next longer size and find the n+1-grams that match
# (as in the code for maximal ngrams).
# There are two possibilities -- matching in the first or second position:
#   The number of matches in the first position = the out-degree
#   The number of matches in the second position = the in-degree
# if so desired, it should be possible to keep a list.
# problem is that the tokens can be 1-3 characters long, and there are spaces...
# Big Idea for frequent n-grams: use the DT:: and let people sort, select and apply all the ngrams they want.
# Name them using the tokens but with a different delimiter to avoid confusion. Go Crazy!
generativity_level<- function(tv, ng){

  # convert to spaces
  tv=str_replace_all(tv, ',',' ')

  # first get the range of ngram sizes we are looking for
  nList = unique(ng$len)
  z=list()

  # loop through them; z accumulates, per ngram, the positions where it
  # occurs inside the (n+1)-grams.
  # NOTE(review): this relies on ng being ordered by len so z[[i]] lines up
  # with ng's rows -- confirm callers preserve that ordering.
  for (n in nList){

    # pick the ngrams of length n from the list given
    ngn= ng[ng$len==n,]

    # get ngrams of length n+1 -- make sure the threads are long enough
    ngplus = get.phrasetable(ngram( long_enough(tv,n+1, ' '), n+1))

    # this picks out the ones that match
    w = lapply(1:nrow(ngn), function(i){ grep(ngn$ngrams[i],ngplus$ngrams)} )

    # locate where each match sits inside the (n+1)-gram
    zplus = lapply(1:nrow(ngn), function(i){ str_locate(ngplus$ngrams[w[[i]]],ngn$ngrams[i]) } )

    z = c(z,zplus)
  }

  # compute the in and out degree from the match positions:
  # start position > 1 means the ngram is the suffix (in-degree);
  # start position == 1 means it is the prefix (out-degree)
  ng$in_degree = unlist(lapply(1:nrow(ng), function(i){
    zm=z[[i]]
    length( zm[zm[,1]>1,1] ) } ))
  ng$out_degree = unlist( lapply(1:nrow(ng), function(i){
    zm=z[[i]]
    length( zm[zm[,1]==1,1] ) } ))

  return(ng)
}

# to avoid errors in count_ngrams, make sure the length of each thread in the text_vector tv is longer than the n-gram size, n
# this gets used in various places so need to pass in the delimiter
long_enough = function(tv,n,delimiter){
  return(tv[ unlist(lapply(1:length(tv), function(i) {length(unlist(strsplit(tv[[i]],delimiter)))>=n})) ])
}

# cluster by network path length