-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathThreadNet_Core.R
More file actions
956 lines (706 loc) · 30.1 KB
/
ThreadNet_Core.R
File metadata and controls
956 lines (706 loc) · 30.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
##########################################################################################################
# THREADNET: Core functions
# This software may be used according to the terms provided in the
# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
# Absolutely no warranty!
##########################################################################################################
# These are the basic functions that convert threads to networks, etc.
#' Converts threads to network
#'
#' Converts sequentially ordered streams of events (threads) into a unimodal, unidimensional network.
#' Sequentially adjacent pairs of events become edges in the resulting network.
#' @family ThreadNet_Core
#'
#' @param et dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param timesplit time measure
#'
#' @return a list containing two dataframes, one for the nodes (nodeDF) and one for the edges (edgeDF)
#'
#' @export
threads_to_network <- function(et,TN,CF,timesplit){
  # Convert sequentially ordered event threads into a unimodal network:
  # nodes are the unique event categories (CF values), edges are the 2-grams
  # (sequentially adjacent event pairs) observed within threads.
  #
  # et        = dataframe containing threads
  # TN        = column holding the unique thread number
  # CF        = column whose values become the network nodes
  # timesplit = column used as the time measure (drives node x-positions)
  #
  # Returns list(nodeDF = nodes, edgeDF = edges).
  et$time = et[[timesplit]]

  # Node labels are the unique CF values; spaces are replaced with underscores
  # so labels can be matched against the space-delimited ngram tokens below.
  node_label = unique(et[[CF]])
  node_label = str_replace_all(node_label," ","_")

  # set up the data frames we need to draw the network
  nodes = data.frame(
    id = seq_along(node_label),
    label = node_label,
    title = node_label)

  # y position = frequency of each category; x position = its mean time
  node_position_y = data.frame(table(et[[CF]]))
  colnames(node_position_y) <- c('label', 'y_pos')
  node_position_x = aggregate(et$time, list(et[[CF]]), mean)
  colnames(node_position_x) <- c('label', 'x_pos')
  nodes = merge(nodes, node_position_y, by=c("label"))
  nodes = merge(nodes, node_position_x, by=c("label"))

  # the 2-grams define the edges
  ngdf = count_ngrams(et,TN, CF, 2)

  # split each 2-gram into its from/to tokens (n=2 gives exactly two pieces)
  from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2)

  # BUGFIX: the original re-split the already-split tokens
  # (str_split(from_to_str[[i]]," ")[1]), which produced list elements rather
  # than the token strings. Each from_to_str[[i]] is already c(from, to).
  from_labels = vapply(from_to_str, `[`, character(1), 1)
  to_labels   = vapply(from_to_str, `[`, character(1), 2)

  # match() is vectorized: look up the node id for every edge endpoint at once
  from = match(from_labels, nodes$label)
  to   = match(to_labels,   nodes$label)

  edges = data.frame(
    from,
    to,
    label = paste(ngdf$freq)
  )

  # attach endpoint coordinates for drawing
  edges = merge(edges, nodes[,c('id', 'y_pos', 'x_pos')], by.x=c('from'), by.y=c('id'))
  edges = merge(edges, nodes[,c('id', 'y_pos', 'x_pos')], by.x=c('to'), by.y=c('id'))
  colnames(edges)<-c('from', 'to', 'label', 'from_y', 'from_x', 'to_y', 'to_x')

  return(list(nodeDF = nodes, edgeDF = edges))
}
# Counting ngrams is essential to several ThreadNet functions
#' Counts ngrams in a set of threads
#'
#' This function counts n-grams within threads where the length of the thread is greater than n.
#' @family ThreadNet_Core
#'
#' @param o dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param n length of ngrams to count
#'
#' @return a dataframe with ngram, frequency and proportion in descending order
#'
#' @export
count_ngrams <- function(o,TN,CF,n){
  # Count the n-grams of size n across all threads.
  # Collapse each thread into one space-delimited string, then drop any thread
  # with fewer than n tokens (ngram() errors on inputs shorter than n).
  threads_as_text <- thread_text_vector(o, TN, CF, ' ')
  usable_threads <- long_enough(threads_as_text, n, ' ')
  # get.phrasetable() returns ngrams, freq and prop in descending frequency order
  phrase_table <- get.phrasetable(ngram(usable_threads, n))
  # record the ngram length -- downstream code mixes several sizes in one table
  phrase_table$len <- n
  return(phrase_table)
}
#################################################################
#
#' Make new threads from a new POV
#'
#' Take the raw occurrences from the input file and sort them by time stamp within
#' a set of contextual factors that remain constant for each thread.
#' @family ThreadNet_Core
#'
#' @param o is the dataframe of cleaned occurrences
#' @param THREAD_CF is a list of 1 or more context factors that define the threads (and stay constant during each thread)
#' @param EVENT_CF is a list of 1 or more context factors that define events (and change during threads)
#'
#' @return dataframe containing the same occurrences sorted from a different point of view
#'
#'@export
ThreadOccByPOV <- function(o,THREAD_CF,EVENT_CF){
  # Re-thread the raw occurrences from a new point of view (POV): sort by the
  # thread-defining context factors and timestamp, assign thread and sequence
  # numbers, time gaps, handoff gaps and a per-thread relative time stamp.
  #
  # o         = dataframe of cleaned occurrences
  # THREAD_CF = context factors that stay constant within a thread
  # EVENT_CF  = context factors that define events (change during threads)
  #
  # Returns the occurrences sorted and annotated from the new POV.

  # make sure there is a value
  if (length(THREAD_CF) == 0){return(o)}

  # get a new column name based on the thread_CF -- use this to define threads
  nPOV = newColName(THREAD_CF)
  occ = combineContextFactors(o,THREAD_CF,newColName(THREAD_CF))

  # the event context factors define the new category of events within those threads
  occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))

  # sort by POV and timestamp to get the stream of activities from this POV
  # (e.g., actor, location, etc.)
  occ = occ[order(occ[nPOV],occ$tStamp),]

  # add columns for the thread number and within-thread sequence number
  occ$POVthreadNum = integer(nrow(occ))
  occ$POVseqNum = integer(nrow(occ))

  # also add columns for the time gaps and handoff gaps seen from this POV
  occ$timeGap = diff_tStamp(occ$tStamp)
  occ$handoffGap = diff_handoffs(occ[EVENT_CF])

  # relative time stamp: initialize to the absolute tStamp, adjusted per thread below
  occ$relativeTime = lubridate::mdy_hms(occ$tStamp)

  # the unique values of the POV column define the threads
  occ[nPOV] = as.factor(occ[,nPOV])
  pov_list = levels(occ[[nPOV]])

  # loop through the pov_list and assign thread numbers and sequence numbers
  start_row=1
  thrd=1
  for (p in pov_list){
    # get the length of the thread
    tlen = sum(occ[[nPOV]]==p)
    # guard against error
    if (tlen>0){
      # compute the index of the end row
      end_row = start_row+tlen-1
      # they all get the same thread number and an incrementing seqNum
      occ[start_row:end_row, "POVthreadNum"] <- rep(as.integer(thrd),tlen)
      occ[start_row:end_row, "POVseqNum"] <- 1:tlen
      # increment the counters for the next thread
      start_row = end_row + 1
      thrd=thrd+1
    } # tlen>0
  }

  # PERFORMANCE FIX: the original recomputed this split/rbind inside the loop
  # on every iteration (O(n^2)), and only the final iteration's result was
  # returned. Computing it once after all thread numbers are assigned yields
  # the same result. Subtract each thread's earliest time stamp from the
  # initialized relativeTime to get time relative to the thread start.
  occ_split = lapply(split(occ, occ$POVthreadNum), function(x) {x$relativeTime = x$relativeTime - min(lubridate::mdy_hms(x$tStamp)); x})
  occ_comb = data.frame(do.call(rbind, occ_split))

  return(occ_comb)
}
##############################################################################################################
#' Maps occurrences into events
#'
#' This function provides a place to map occurrences into events, so it is not necessary to interpret individual
#' occurrences in isolation. There are many ways to accomplish this mapping.
#' @family ThreadNet_Core
#'
#' @param o a dataframe of occurrences
#' @param mapping = one-to-one or clustering
#' @param m = method parameter = one of c('Variable chunks','Uniform chunks')
#' @param uniform_chunk_size = used to identify breakpoints -- from input slider
#' @param tThreshold = used to identify breakpoints -- from input slider
#' @param EventMapName = used to store this mapping in an environment (not used yet)
#' @param chunk_CF - context factors used to delineate chunks
#' @param EVENT_CF - context factors used to define events
#' @param compare_CF = context factors used for comparison -- need to be copied over here when the thread is created.
#' @param timescale hours, min or sec
#'
#' @return event data frame, with occurrences aggregated into events. Dataframe includes: threadNum, seqNum,
#' and set of columns E_1, E_2, ..., that indicate the membership of events in clusters of events.
#'
#' @export
OccToEvents1 <- function(o,EventMapName,EVENT_CF, compare_CF){
# One-to-one mapping from occurrences to events: each occurrence becomes its
# own event. Adds eventDuration, occurrences, one V_<cf> indicator-vector
# column per EVENT_CF, and the combined ZM_1 event code, then stores the
# result under EventMapName.
# o            = dataframe of occurrences (threaded: has POVthreadNum/POVseqNum)
# EventMapName = name under which to store the mapping; "" means do nothing
# EVENT_CF     = context factors that define events
# compare_CF   = unused here; kept so the signature matches OccToEvents2/3
# Returns the stored event map (whatever store_event_mapping returns).
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
# we are mapping one-to-one, so copy the input to the output and then add/rename some other columns as needed
e=o
# occurrences have no duration
e$eventDuration = 0
# rename the threadNum and seqNum columns
names(e)[names(e)=="POVthreadNum"] <- "threadNum"
names(e)[names(e)=="POVseqNum"] <- "seqNum"
# these are just equal to the row numbers -- one occurrence per event
e["occurrences"] = 1:nrow(e)
# now go through and change each of the CF values to a vector (0,0,0,1,0,0,0,0)
for (cf in EVENT_CF){
# make a new V_ column for each CF; each cell holds the indicator vector
# over the factor's levels for that row (stored as a one-element list)
VCF = paste0("V_",cf)
e[[VCF]]= vector(mode = "integer",length=nrow(e))
for (r in 1:nrow(e)){
e[[r,VCF]] = list(convert_CF_to_vector(e,cf,r))
}
}
# just add the one column with the combined values
# e["ZM_1"] = as.factor(e[,newColName(EVENT_CF)])
e["ZM_1"] = as.integer(e[,newColName(EVENT_CF)])
# Add the mapping to the global list of mappings. No longer storing the cluster solution here.
# map = list(name = paste(EventMapName), threads = e)
#
# GlobalEventMappings <<- append(list(map), GlobalEventMappings )
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# print( get_event_mapping_names( GlobalEventMappings ) )
# save(GlobalEventMappings, file="eventMappings.RData")
# for debugging, this is really handy
# save(o,e,file="O_and_E_1.rdata")
return(eventMap)
}
# this one creates chunks based on the handoff index
# OccToEvents2: map occurrences into events by chunking at handoff breakpoints
# (handoffGap == 0 means every event CF changed), then cluster the chunks by
# sequence similarity and add one ZM_<k> column per cluster level.
# o            = dataframe of occurrences (needs handoffGap, seqNum, POVthreadNum)
# EventMapName = name under which to store the mapping; "" means do nothing
# EVENT_CF     = context factors that define events (aggregated into V_ columns)
# compare_CF   = context factors copied over for comparisons
# Returns the stored event map (whatever store_event_mapping returns).
OccToEvents2 <- function(o, EventMapName,EVENT_CF, compare_CF){
# put this here for now
timescale='mins'
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
#### First get the break points between the events
# whenever there is a zero handoff gap that means everything has changed
breakpoints = which(o$handoffGap == 0)
# Grab the breakpoints from the beginning of the threads as well
threadbreaks = which(o$seqNum == 1)
breakpoints = sort(union(threadbreaks,breakpoints))
### Use the break points to find the chunks -- just store the index back to the raw data
nChunks = length(breakpoints)
# print(paste("nChunks=",nChunks))
# make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
e = make_event_df(EVENT_CF, compare_CF, nChunks)
# need to create chunks WITHIN threads. Need to respect thread boundaries
# take union of the breakpoints, plus thread boundaries, plus 1st and last row
# counters for assigning thread and sequence numbers
thisThread=1 # just for counting in this loop
lastThread=0
seqNo=0 # resets for each new thread
for (chunkNo in 1:nChunks){
# Chunks start at the current breakpoint
start_idx=breakpoints[chunkNo]
# Chunks end at the next breakpoint, minus one
# for the last chunk,the stop_idx is the last row
if (chunkNo < nChunks){
stop_idx = breakpoints[chunkNo+1] - 1
} else if (chunkNo==nChunks){
stop_idx = nrow(o)
}
# assign the occurrences (the row indices of o covered by this chunk)
e$occurrences[[chunkNo]] = list(start_idx:stop_idx)
# e$eventStop[chunkNo] = as.integer(stop_idx)
# e$eventStart[chunkNo] = as.integer(start_idx)
# assign timestamp and duration (first occurrence's time; span to the last)
e$tStamp[chunkNo] = o$tStamp[start_idx]
e$eventDuration[chunkNo] = difftime(o$tStamp[stop_idx], o$tStamp[start_idx],units=timescale )
# copy in the threadNum and assign sequence number
e$threadNum[chunkNo] = o$POVthreadNum[start_idx]
thisThread = o$POVthreadNum[start_idx]
# fill in data for each of the context factors (taken from the first row)
for (cf in compare_CF){
e[chunkNo,cf] = as.character(o[start_idx,cf])
}
# the V_ columns sum the per-occurrence indicator vectors for this chunk
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = list(aggregate_VCF_for_event(o,e$occurrences[chunkNo],cf ))
}
# Advance or reset the seq counters (sequence restarts on each new thread)
if (thisThread == lastThread){
seqNo = seqNo +1
} else if (thisThread != lastThread){
lastThread = thisThread
seqNo = 1
}
e$seqNum[chunkNo] = seqNo
}
# convert them to factors
for (cf in compare_CF){
e[cf] = as.factor(e[,cf])
}
### Use optimal string alignment to compare the chunks. This is O(n^^2)
clust = hclust( dist_matrix_seq(e), method="ward.D2" )
## Create a new column for each cluster solution -- would be faster with data.table
for (cluster_level in 1:nChunks){
clevelName = paste0("ZM_",cluster_level)
e[clevelName] = cutree(clust, k=cluster_level)
} # for cluster_level
# for debugging, this is really handy
# save(o,e,file="O_and_E_2.rdata")
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# print( get_event_mapping_names( GlobalEventMappings ) )
# save(GlobalEventMappings, file="eventMappings.RData")
# need return the threads and also the cluster solution for display
return(eventMap)
}
# this one creates events based on frequent ngrams or regular expressions
# OccToEvents3: map occurrences into events by replacing frequent-ngram /
# regex patterns in each thread's text with event labels. Occurrences matched
# by a pattern are merged into one event; unmatched ("irregular") occurrences
# are kept or dropped according to KeepIrregularEvents.
# o                   = dataframe of occurrences
# EventMapName        = name under which to store the mapping; "" = do nothing
# EVENT_CF            = context factors that define events (V_ columns)
# compare_CF          = context factors copied over for comparisons
# TN, CF              = thread-number and event columns for the text vector
# rx                  = dataframe of patterns ($pattern, $label), comma delimited
# KeepIrregularEvents = 'Keep' or 'Drop' for occurrences matching no pattern
# Returns the stored event map (whatever store_event_mapping returns).
OccToEvents3 <- function(o, EventMapName,EVENT_CF, compare_CF,TN, CF, rx, KeepIrregularEvents){
# keep track of the length (token count) of each pattern
for (i in seq_len(nrow(rx)))
{rx$patLength[i] = length(unlist(strsplit(rx$pattern[i], ',')))
}
# put this here for now
timescale='mins'
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
# get the text vector for this set of threaded occurrences, delimited by commas
tv = thread_text_vector( o, TN, CF, ',' )
# apply the regex replacements to tv and split each thread back on the commas
tvrx = replace_regex_list( tv, rx )
tvrxs = lapply(1:length(tvrx), function(i) {unlist(strsplit(tvrx[[i]],','))})
# count the total number of chunks (events) across all threads
nChunks = length(unlist(tvrxs))
# make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
e = make_event_df(EVENT_CF, compare_CF, nChunks)
# loop through the threads and fill in the data for each event:
# a token that is a regex label aggregates the rows its pattern covers;
# any other token copies one original row across.
chunkNo=0
original_row=0
for (thread in 1:length(tvrxs)){
# the events stay in sequence
for (sn in 1:length(tvrxs[[thread]])){
# increment the current row numbers
chunkNo = chunkNo+1
original_row=original_row+1
# (BUGFIX: removed a leftover debug print() of original_row that fired once
# per occurrence and spammed the console)
# assign the thread and sequence number
e$threadNum[chunkNo] = thread
e$seqNum[chunkNo] = sn
# copy over the comparison CFs from the first underlying occurrence
for (cf in compare_CF){
e[[chunkNo, cf]] = o[[original_row, cf]]
}
if (tvrxs[[thread]][sn] %in% rx$label ){
# matched a regex label: merge the pattern's rows into one event
# Use the ZM_1 column to store the new label
e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
# the event's occurrences are the rows covered by the pattern
rxLen = rx$patLength[which( rx$label==tvrxs[[thread]][sn])]
e$occurrences[[chunkNo]] = list(seq(original_row,original_row+rxLen-1,1))
original_row = original_row + rxLen-1
# compute the V_ based on the occurrences
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = list(aggregate_VCF_for_regex(o,e$occurrences[chunkNo],cf ))
}
# assign timestamp from the last covered occurrence
# (NOTE(review): eventDuration is not set for merged events -- confirm)
e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]
}
else if (KeepIrregularEvents=='Keep') {
# no pattern matched: copy the single occurrence straight across
# Use the ZM_1 column to store the new labels
e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
e$occurrences[[chunkNo]] = original_row
# copy the rest of the data
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = o[[original_row, VCF]]
}
e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]
e[[chunkNo,'eventDuration']] = o[[original_row,'eventDuration']]
}
} # sn loop
} # thread loop
# take out the irregular events (empty rows) if so desired
if (KeepIrregularEvents=='Drop'){
# keep the subset where the event is not blank
e=subset(e, !ZM_1=='')
# should probably re-number the sequence numbers...
}
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# need return the threads and also the cluster solution for display
return(eventMap)
}
# Cluster the events of an existing mapping and store the result as a new
# event mapping with fresh ZM_ (cluster level) columns.
# e              = the event dataframe
# EventMapName   = name of the existing mapping (selected in the UI)
# NewMapName     = name under which to store the clustered mapping
# cluster_method = "Sequential similarity", "Contextual Similarity" or
#                  "Network Structure"
# event_CF       = event context factors (used for contextual similarity)
# Returns the hclust object for display, or NULL if no name was supplied.
clusterEvents <- function(e, EventMapName, NewMapName, cluster_method, event_CF){
  # only run if something is filled in
  # BUGFIX: use short-circuiting || -- with the elementwise |, a NULL
  # NewMapName made the condition logical(0) and if() raised an error.
  if (is.null(NewMapName) || NewMapName=="") return(NULL)
  # pick the distance matrix for the requested method
  if (cluster_method=="Sequential similarity"){
    dd = dist_matrix_seq(e)
  } else if (cluster_method=="Contextual Similarity"){
    dd = dist_matrix_context(e,event_CF)
  } else if (cluster_method=="Network Structure"){
    dd = dist_matrix_network(e)
  } else {
    # previously an unknown method left dd undefined and failed later;
    # fail fast with a clear message instead
    stop(paste("clusterEvents: unknown cluster_method:", cluster_method))
  }
  ### Use Ward clustering on the pairwise distances. This is O(n^^2)
  clust = hclust( dd, method="ward.D2" )
  # delete the old ZM_ columns; new cluster-level columns replace them
  e[grep("ZM_",colnames(e))]<-NULL
  # number of chunks is the number of rows (the number of events to be clustered)
  nChunks = nrow(e)
  # make new data frame with one ZM_ column per cluster level
  zm = setNames(data.frame(matrix(ncol = nChunks, nrow = nChunks)), paste0("ZM_", 1:nChunks))
  ## Create a new column for each cluster solution
  for (cluster_level in 1:nChunks){
    clevelName = paste0("ZM_",cluster_level)
    zm[clevelName] = cutree(clust, k=cluster_level)
  } # for cluster_level
  # append the cluster assignments onto the events to allow zooming
  newmap=cbind(e, zm)
  # store the event map in the GlobalEventMappings
  eventMap = store_event_mapping( NewMapName, newmap )
  # return the cluster solution for display
  return(clust)
}
# Computes the pairwise distance between chunks based on their occurrence
# sequences: each event is represented by the sequence of unique occurrence
# ids that make it up, compared with optimal string alignment (OSA) distance.
dist_matrix_seq <- function(e){
  nChunks = nrow(e)
  evector=vector(mode="list", length = nChunks)
  for (i in seq_len(nChunks)){
    # BUGFIX: use [[ ]] -- the original evector[i] = <vector> stored only the
    # first element of the sequence (with a warning), so distances were
    # computed on truncated sequences.
    evector[[i]]=unique(as.integer(unlist(e$occurrences[[i]])))
  }
  return( stringdistmatrix( evector, method="osa") )
}
# Computes the pairwise distance between chunks based on their context:
# stack the V_ (context-factor count) vectors into a matrix, one row per
# event, and take euclidean distances between the rows.
# e  = events, with V_ columns
# CF = event CFs
dist_matrix_context <- function( e, CF ){
  vcf_rows <- VCF_matrix(e, paste0("V_", CF))
  return( dist(vcf_rows, method = "euclidean") )
}
# Computes the pairwise distance between chunks for the "Network Structure"
# option. NOTE(review): currently identical to dist_matrix_seq -- it compares
# occurrence sequences, not an actual network metric; confirm intent.
dist_matrix_network <- function(e){
  nChunks = nrow(e)
  evector=vector(mode="list", length = nChunks)
  for (i in seq_len(nChunks)){
    # BUGFIX: use [[ ]] so the whole occurrence sequence is stored, not just
    # its first element (evector[i] = <vector> truncated with a warning).
    evector[[i]]=unique(as.integer(unlist(e$occurrences[[i]])))
  }
  return( stringdistmatrix( evector, method="osa") )
}
# Build the adjacency matrix of the network defined by an edge list
# (a two-column from/to structure coercible to a matrix).
net_adj_matrix <- function(edges){
  edge_graph <- graph_from_edgelist(as.matrix(edges))
  return(as_adj(edge_graph))
}
# new data structure for events (BTP 3/28)
# Build an empty event dataframe with N rows: fixed timing/thread columns,
# one NA column per comparison CF, one V_ column per event CF (these later
# hold the CF count vectors), and the ZM_1 event code column.
make_event_df <- function(event_CF,compare_CF,N){
  # fixed columns: event start time, duration, occurrence list, thread ids
  base_cols <- data.frame(
    tStamp = numeric(N),
    eventDuration = numeric(N),
    occurrences = integer(N),
    threadNum = integer(N),
    seqNum = integer(N))
  # one NA-filled column per comparison CF, named after the CF
  compare_cols <- setNames(data.frame(matrix(ncol = length(compare_CF), nrow = N)), compare_CF)
  # one NA-filled column per event CF, prefixed with V_
  vcf_cols <- setNames(data.frame(matrix(ncol = length(event_CF), nrow = N)), paste0("V_",event_CF))
  # combine, then add one more column for the event code/description
  e <- cbind(base_cols, compare_cols, vcf_cols)
  e$ZM_1 <- character(N)
  return(e)
}
# Convert one occurrence's context factor value into an indicator vector over
# the factor's levels (like this: 0 0 0 0 0 0 0 0 1 0 0).
# o  = the dataframe of occurrences
# CF = the context factor (column name); must be a factor column
# r  = the row (occurrence number in the One-to-One mapping)
convert_CF_to_vector <- function(o,CF,r){
  level_matches <- levels(o[[CF]]) == o[[r, CF]]
  return(as.integer(level_matches))
}
# Aggregate the VCF (CF vector) for that CF
# There are two layers to this.
# 1) aggregate_VCF_for_event
#    Within a single event, aggregate the VCF for the occurrences that make up that event.
#    Only used by the functions that convert occurrences to events.
# 2) aggregate_VCF_for_cluster
#    For a cluster level, aggregate the events at that cluster level (e.g., ZM_n).
#    Works on any event, even the one_to_one mapping.
#
# o is a dataframe of occurrences. The V_ (VCF) columns need not be filled in;
#   the indicator vector is re-computed here for each occurrence.
# occList is the list of occurrence row numbers for this event (e$occurrences)
# cf is the name of the contextual factor to create the VCF
# Returns the elementwise sum of the indicator vectors of all occurrences.
aggregate_VCF_for_event <- function(o, occList, cf){
  # (BUGFIX: removed two leftover debug print(aggCF) calls and an unused
  # VCF local that was never referenced)
  occs = unlist(occList)
  # start with the first one so the dimension of the vector is correct
  aggCF = convert_CF_to_vector(o, cf, occs[1])
  # now add the rest, if there are any
  if (length(occs) > 1){
    for (idx in seq(2,length(occs),1)){
      aggCF = aggCF + convert_CF_to_vector(o, cf, occs[idx])
    }}
  return(aggCF)
}
# Aggregate the VCF (CF count vector) across the occurrences of a regex-built
# event. Unlike aggregate_VCF_for_event, this version assumes the V_<cf>
# column is already computed on o (true for the one-to-one mapping from
# OccToEvents1).
# NOTE(review): the original author flagged this as "not correct..." --
# confirm the aggregation before relying on it.
# o       = dataframe of occurrences with V_<cf> columns filled in
# occList = the list of occurrence row numbers for this event (e$occurrences)
# cf      = the contextual factor whose VCF is aggregated
# Returns the elementwise sum of the stored V_<cf> vectors.
aggregate_VCF_for_regex <- function(o, occList, cf){
# get the column name for the VCF
VCF = paste0("V_",cf)
# start with the first occurrence so the vector has the right dimension
aggCF = unlist(o[unlist(occList)[1],VCF])
# print( aggCF)
# now add the rest elementwise, if there are any
if (length(unlist(occList)) > 1){
for (idx in seq(2,length(unlist(occList)),1)){
# print( aggCF)
aggCF = aggCF+unlist(o[[unlist(occList)[idx],VCF]])
}}
return(aggCF)
}
# Same basic idea, but works on a set of events within a cluster rather than a
# set of occurrences within an event: take the subset of rows for the cluster,
# convert their VCF vectors to a matrix, and add them up column-wise.
# e        = the events
# cf       = a single contextual factor, so call this in a loop
# eclust   = the cluster id to aggregate
# zoom_col = the ZM_ column holding the cluster assignment (could be anything)
aggregate_VCF_for_cluster <- function(e, cf, eclust, zoom_col){
  # the VCF column for this context factor
  vcf_col = paste0("V_",cf)
  # subset of events in the requested cluster -- just the VCF column
  cluster_rows = which(as.integer(e[[zoom_col]]) == eclust)
  s = e[cluster_rows, vcf_col]
  # empty cluster: nothing to aggregate
  if (is.null(unlist(s))) {
    return(NULL)
  }
  # one row per event; column sums give the aggregated counts
  return( colSums(matrix(unlist(s), nrow = length(s), byrow = TRUE)) )
}
# Bind the VCF matrices for a list of V_ column names into one wide matrix,
# one row per event (columns of the per-CF matrices side by side).
VCF_matrix <- function(e, vcf ){
  per_cf <- lapply(vcf, function(col_name) one_vcf_matrix(e, col_name))
  return(do.call(cbind, per_cf))
}
# Stack one V_ column (a list of equal-length vectors) into a matrix with one
# row per event.
one_vcf_matrix <- function(e, vcf){
  vcf_list <- e[[vcf]]
  return( matrix(unlist(vcf_list), nrow = length(vcf_list), byrow = TRUE) )
}
# This is used for the regex pages to show the threads.
# Similar code is used in count_ngrams and to make networks, but with
# different delimiters and a minimum sequence length (ngram size), which can
# be filtered after this function (see long_enough).
# Returns one delimited string per thread, in the order the unique thread
# numbers first appear in o[[TN]].
thread_text_vector <- function(o, TN, CF, delimiter){
  # Initialize text vector
  tv = vector(mode="character")
  # Loop through the unique thread numbers
  j=0
  for (i in unique(o[[TN]])){
    # events of this thread, in order; concatenate() joins them with spaces,
    # which are then swapped for the requested delimiter.
    # (cleanup: the original computed this subset twice, once into an unused
    # local and once inline)
    thread_events = o[o[[TN]]==i,CF]
    j=j+1
    tv[j] = str_replace_all(concatenate(thread_events),' ',delimiter)
  }
  return(tv)
}
# use this to replace patterns for regex and ngrams
# tv = the text vector for the set of threads
# rx = the dataframe of regular expressions ($pattern, $label)
# Every pattern is applied in order to every thread string; later patterns see
# the labels substituted by earlier ones.
replace_regex_list <- function(tv, rx ){
  for (thread_idx in 1:length(tv)) {
    for (rx_idx in 1:nrow(rx)) {
      tv[thread_idx] = str_replace_all(tv[thread_idx], rx$pattern[rx_idx], rx$label[rx_idx])
    }
  }
  return(tv)
}
# same function, but with lapply -- but does not work.
# replace_regex_list_lapply <- function(tv, rx){
#
# lapply(1:length(tv), function(i){
# lapply(1:nrow(rx),function(j){
# str_replace_all(tv[i], rx$pattern[j], rx$label[j] ) }
# ) })
# }
# No longer needed?
# selectize_frequent_ngrams<- function(e, TN, CF, minN, maxN, threshold){
#
# f=str_replace_all(trimws(frequent_ngrams(e, TN, CF, minN, maxN, threshold,TRUE)[,'ngrams'], which=c('right')), ' ',',')
# return(f)
# }
# combined set of frequent ngrams over sizes minN..maxN
# add parameter to make maximal a choice
# e, TN, CF   = threads, thread-number column, event column (see count_ngrams)
# minN, maxN  = range of ngram sizes to count (maxN < minN counts only minN)
# onlyMaximal = drop ngrams contained inside longer ones
# Returns the ngram table (ngrams, freq, len) sorted by descending frequency,
# with singletons (freq == 1) removed.
frequent_ngrams <- function(e, TN, CF, minN, maxN, onlyMaximal=TRUE){
  # count every size and stack the tables
  # (PERF: build a list and rbind once instead of growing with rbind in a loop)
  sizes = minN:max(minN, maxN)
  ng = do.call(rbind, lapply(sizes, function(n) count_ngrams(e, TN, CF, n)))
  # remove the rows that happen once and only keep the columns we want
  ng=ng[ng$freq>1,c('ngrams','freq', 'len')]
  # just take the maximal ones if so desired
  if (onlyMaximal) { ng=maximal_ngrams(ng) }
  # return the set sorted by most frequent
  return(ng[order(-ng$freq),])
}
# this filters out ngrams that are contained within others ('2 2' is part of '2 2 2')
maximal_ngrams <- function(ng){
  # count how many ngrams in the table each ngram matches; every ngram
  # matches at least itself
  match_counts = vapply(seq_len(nrow(ng)),
                        function(i) length(grep(ng$ngrams[i], ng$ngrams)),
                        integer(1))
  # maximal ngrams are the ones that match only themselves
  return(ng[which(match_counts == 1),])
}
# compute the support level for each ngram
# tv = text vectors for the threads
# ng = frequent ngrams data frame
# returns the ng data frame with support (and generativity) columns added
support_level <- function(tv, ng) {
  # work in space-delimited form
  tv = str_replace_all(tv, ',' , ' ')
  totalN = length(tv)
  # trailing whitespace on the ngrams would break the matching below
  ng$ngrams = trimws(ng$ngrams)
  # support = share of threads in which the ngram appears at least once
  thread_hits = lapply(1:nrow(ng), function(i){
    length(grep(ng$ngrams[i], tv)) })
  ng$support = unlist(thread_hits) / totalN
  # toss in the generativity level
  ng = generativity_level(tv, ng)
  return(ng)
}
# compute the generativity = in-degree and out-degree of each ngram
# For each ngram of length n, find the (n+1)-grams that contain it. A match
# can start at the first position (the ngram extends to the right, counted as
# out-degree) or later (the ngram is preceded by a token, counted as
# in-degree).
# tv = text vectors for the threads (comma- or space-delimited)
# ng = frequent ngrams data frame (needs $ngrams and $len)
# Returns ng with in_degree and out_degree columns added.
# NOTE(review): z is accumulated with c(z, zplus) across ngram sizes, so
# z[[i]] lines up with ng rows only if ng is ordered by len in the same order
# as unique(ng$len) -- confirm with callers.
generativity_level<- function(tv, ng){
# problem is that the tokens can be 1-3 characters long, and there are spaces...
# Big Idea for frequent n-grams: use the DT:: and let people sort, select and apply all the ngrams they want.
# Name them using the tokens but with a different delimiter to avoid confusion. Go Crazy!
# convert to spaces
tv=str_replace_all(tv, ',',' ')
# first get the range of ngram sizes we are looking for
nList = unique(ng$len)
z=list()
# loop through them
for (n in nList){
# print(paste('n = ',n))
# pick the ngrams of length n from the list given
ngn= ng[ng$len==n,]
# get ngrams of length n+1 -- make sure the threads are long enough
ngplus = get.phrasetable(ngram( long_enough(tv,n+1, ' '), n+1))
# this picks out the (n+1)-grams that contain each n-gram
w = lapply(1:nrow(ngn), function(i){
grep(ngn$ngrams[i],ngplus$ngrams)} )
#print(w)
# locate WHERE the n-gram sits inside each matching (n+1)-gram
zplus = lapply(1:nrow(ngn), function(i){
str_locate(ngplus$ngrams[w[[i]]],ngn$ngrams[i]) } )
# print(z)
z = c(z,zplus)
}
# compute the in and out degree from the match positions:
# a match starting after position 1 means something precedes the ngram (in),
# a match starting at position 1 means the ngram is extended to the right (out)
ng$in_degree = unlist(lapply(1:nrow(ng), function(i){
zm=z[[i]]
length( zm[zm[,1]>1,1] ) } ))
ng$out_degree = unlist( lapply(1:nrow(ng), function(i){
zm=z[[i]]
length( zm[zm[,1]==1,1] ) } ))
# ng$generativity = lapply(1:nrow(ng), function(i) {ng$out_degree[i] * ng$in_degree[i]})
return(ng)
}
# To avoid errors in count_ngrams, keep only the threads in the text vector tv
# whose token count (split on delimiter) is at least the ngram size n.
# This gets used in various places, so the delimiter is passed in.
long_enough = function(tv,n,delimiter){
  # (ROBUSTNESS: the original 1:length(tv) errored on an empty tv; vapply over
  # seq_along returns character(0) instead)
  token_counts = vapply(seq_along(tv),
                        function(i) length(unlist(strsplit(tv[[i]], delimiter))),
                        integer(1))
  return(tv[token_counts >= n])
}