-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathThreadNet_Core.R
More file actions
956 lines (706 loc) · 30.1 KB
/
ThreadNet_Core.R
File metadata and controls
956 lines (706 loc) · 30.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
##########################################################################################################
# THREADNET: Core functions
# This software may be used according to the terms provided in the
# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
# Absolutely no warranty!
##########################################################################################################
# These are the basic functions that convert threads to networks, etc.
#' Converts threads to network
#'
#' Converts sequentially ordered streams of events (threads) into a unimodal, unidimensional network.
#' Sequentially adjacent pairs of events become edges in the resulting network.
#' @family ThreadNet_Core
#'
#' @param et dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param timesplit time measure
#'
#' @return a list containing two dataframes, one for the nodes (nodeDF) and one for the edges (edgeDF)
#'
#' @export
threads_to_network <- function(et,TN,CF,timesplit){
  # Convert sequentially ordered event threads into a unimodal network:
  # nodes are the unique event categories (CF values), edges are the 2-grams
  # (sequentially adjacent event pairs) observed within threads.
  #
  # et        = dataframe containing threads
  # TN        = column holding the unique thread number
  # CF        = column whose values become the network nodes
  # timesplit = column used as the time measure (drives node x-positions)
  #
  # Returns list(nodeDF = nodes, edgeDF = edges).
  et$time = et[[timesplit]]

  # Node labels are the unique CF values; spaces are replaced with underscores
  # so labels can be matched against the space-delimited ngram tokens below.
  node_label = unique(et[[CF]])
  node_label = str_replace_all(node_label," ","_")

  # set up the data frames we need to draw the network
  nodes = data.frame(
    id = seq_along(node_label),
    label = node_label,
    title = node_label)

  # y position = frequency of each category; x position = its mean time
  node_position_y = data.frame(table(et[[CF]]))
  colnames(node_position_y) <- c('label', 'y_pos')
  node_position_x = aggregate(et$time, list(et[[CF]]), mean)
  colnames(node_position_x) <- c('label', 'x_pos')
  nodes = merge(nodes, node_position_y, by=c("label"))
  nodes = merge(nodes, node_position_x, by=c("label"))

  # the 2-grams define the edges
  ngdf = count_ngrams(et,TN, CF, 2)

  # split each 2-gram into its from/to tokens (n=2 gives exactly two pieces)
  from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2)

  # BUGFIX: the original re-split the already-split tokens
  # (str_split(from_to_str[[i]]," ")[1]), which produced list elements rather
  # than the token strings. Each from_to_str[[i]] is already c(from, to).
  from_labels = vapply(from_to_str, `[`, character(1), 1)
  to_labels   = vapply(from_to_str, `[`, character(1), 2)

  # match() is vectorized: look up the node id for every edge endpoint at once
  from = match(from_labels, nodes$label)
  to   = match(to_labels,   nodes$label)

  edges = data.frame(
    from,
    to,
    label = paste(ngdf$freq)
  )

  # attach endpoint coordinates for drawing
  edges = merge(edges, nodes[,c('id', 'y_pos', 'x_pos')], by.x=c('from'), by.y=c('id'))
  edges = merge(edges, nodes[,c('id', 'y_pos', 'x_pos')], by.x=c('to'), by.y=c('id'))
  colnames(edges)<-c('from', 'to', 'label', 'from_y', 'from_x', 'to_y', 'to_x')

  return(list(nodeDF = nodes, edgeDF = edges))
}
# Counting ngrams is essential to several ThreadNet functions
#' Counts ngrams in a set of threads
#'
#' This function counts n-grams within threads where the length of the thread is greater than n.
#' @family ThreadNet_Core
#'
#' @param o dataframe containing threads
#' @param TN name of column in dataframe that contains a unique thread number for each thread
#' @param CF name of the column in dataframe that contains the events that will form the nodes of the network
#' @param n length of ngrams to count
#'
#' @return a dataframe with ngram, frequency and proportion in descending order
#'
#' @export
count_ngrams <- function(o,TN,CF,n){
  # Count the n-grams of size n across all threads.
  # Collapse each thread into one space-delimited string, then drop any thread
  # with fewer than n tokens (ngram() errors on inputs shorter than n).
  threads_as_text <- thread_text_vector(o, TN, CF, ' ')
  usable_threads <- long_enough(threads_as_text, n, ' ')
  # get.phrasetable() returns ngrams, freq and prop in descending frequency order
  phrase_table <- get.phrasetable(ngram(usable_threads, n))
  # record the ngram length -- downstream code mixes several sizes in one table
  phrase_table$len <- n
  return(phrase_table)
}
#################################################################
#
#' Make new threads from a new POV
#'
#' Take the raw occurrences from the input file and sort them by time stamp within
#' a set of contextual factors that remain constant for each thread.
#' @family ThreadNet_Core
#'
#' @param o is the dataframe of cleaned occurrences
#' @param THREAD_CF is a list of 1 or more context factors that define the threads (and stay constant during each thread)
#' @param EVENT_CF is a list of 1 or more context factors that define events (and change during threads)
#'
#' @return dataframe containing the same occurrences sorted from a different point of view
#'
#'@export
ThreadOccByPOV <- function(o,THREAD_CF,EVENT_CF){
  # Re-thread the raw occurrences from a new point of view (POV): sort by the
  # thread-defining context factors and timestamp, assign thread and sequence
  # numbers, time gaps, handoff gaps and a per-thread relative time stamp.
  #
  # o         = dataframe of cleaned occurrences
  # THREAD_CF = context factors that stay constant within a thread
  # EVENT_CF  = context factors that define events (change during threads)
  #
  # Returns the occurrences sorted and annotated from the new POV.

  # make sure there is a value
  if (length(THREAD_CF) == 0){return(o)}

  # get a new column name based on the thread_CF -- use this to define threads
  nPOV = newColName(THREAD_CF)
  occ = combineContextFactors(o,THREAD_CF,newColName(THREAD_CF))

  # the event context factors define the new category of events within those threads
  occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))

  # sort by POV and timestamp to get the stream of activities from this POV
  # (e.g., actor, location, etc.)
  occ = occ[order(occ[nPOV],occ$tStamp),]

  # add columns for the thread number and within-thread sequence number
  occ$POVthreadNum = integer(nrow(occ))
  occ$POVseqNum = integer(nrow(occ))

  # also add columns for the time gaps and handoff gaps seen from this POV
  occ$timeGap = diff_tStamp(occ$tStamp)
  occ$handoffGap = diff_handoffs(occ[EVENT_CF])

  # relative time stamp: initialize to the absolute tStamp, adjusted per thread below
  occ$relativeTime = lubridate::mdy_hms(occ$tStamp)

  # the unique values of the POV column define the threads
  occ[nPOV] = as.factor(occ[,nPOV])
  pov_list = levels(occ[[nPOV]])

  # loop through the pov_list and assign thread numbers and sequence numbers
  start_row=1
  thrd=1
  for (p in pov_list){
    # get the length of the thread
    tlen = sum(occ[[nPOV]]==p)
    # guard against error
    if (tlen>0){
      # compute the index of the end row
      end_row = start_row+tlen-1
      # they all get the same thread number and an incrementing seqNum
      occ[start_row:end_row, "POVthreadNum"] <- rep(as.integer(thrd),tlen)
      occ[start_row:end_row, "POVseqNum"] <- 1:tlen
      # increment the counters for the next thread
      start_row = end_row + 1
      thrd=thrd+1
    } # tlen>0
  }

  # PERFORMANCE FIX: the original recomputed this split/rbind inside the loop
  # on every iteration (O(n^2)), and only the final iteration's result was
  # returned. Computing it once after all thread numbers are assigned yields
  # the same result. Subtract each thread's earliest time stamp from the
  # initialized relativeTime to get time relative to the thread start.
  occ_split = lapply(split(occ, occ$POVthreadNum), function(x) {x$relativeTime = x$relativeTime - min(lubridate::mdy_hms(x$tStamp)); x})
  occ_comb = data.frame(do.call(rbind, occ_split))

  return(occ_comb)
}
##############################################################################################################
#' Maps occurrences into events
#'
#' This function provides a place to map occurrences into events, so it is not necessary to interpret individual
#' occurrences in isolation. There are many ways to accomplish this mapping.
#' @family ThreadNet_Core
#'
#' @param o a dataframe of occurrences
#' @param mapping = one-to-one or clustering
#' @param m = method parameter = one of c('Variable chunks','Uniform chunks')
#' @param uniform_chunk_size = used to identify breakpoints -- from input slider
#' @param tThreshold = used to identify breakpoints -- from input slider
#' @param EventMapName = used to store this mapping in an environment (not used yet)
#' @param chunk_CF - context factors used to delineate chunks
#' @param EVENT_CF - context factors used to define events
#' @param compare_CF = context factors used for comparison -- need to be copied over here when the thread is created.
#' @param timescale hours, min or sec
#'
#' @return event data frame, with occurrences aggregated into events. Dataframe includes: threadNum, seqNum,
#' and set of columns E_1, E_2, ..., that indicate the membership of events in clusters of events.
#'
#' @export
OccToEvents1 <- function(o,EventMapName,EVENT_CF, compare_CF){
# One-to-one mapping from occurrences to events: each occurrence becomes its
# own event. Adds eventDuration, occurrences, one V_<cf> indicator-vector
# column per EVENT_CF, and the combined ZM_1 event code, then stores the
# result under EventMapName.
# o            = dataframe of occurrences (threaded: has POVthreadNum/POVseqNum)
# EventMapName = name under which to store the mapping; "" means do nothing
# EVENT_CF     = context factors that define events
# compare_CF   = unused here; kept so the signature matches OccToEvents2/3
# Returns the stored event map (whatever store_event_mapping returns).
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
# we are mapping one-to-one, so copy the input to the output and then add/rename some other columns as needed
e=o
# occurrences have no duration
e$eventDuration = 0
# rename the threadNum and seqNum columns
names(e)[names(e)=="POVthreadNum"] <- "threadNum"
names(e)[names(e)=="POVseqNum"] <- "seqNum"
# these are just equal to the row numbers -- one occurrence per event
e["occurrences"] = 1:nrow(e)
# now go through and change each of the CF values to a vector (0,0,0,1,0,0,0,0)
for (cf in EVENT_CF){
# make a new V_ column for each CF; each cell holds the indicator vector
# over the factor's levels for that row (stored as a one-element list)
VCF = paste0("V_",cf)
e[[VCF]]= vector(mode = "integer",length=nrow(e))
for (r in 1:nrow(e)){
e[[r,VCF]] = list(convert_CF_to_vector(e,cf,r))
}
}
# just add the one column with the combined values
# e["ZM_1"] = as.factor(e[,newColName(EVENT_CF)])
e["ZM_1"] = as.integer(e[,newColName(EVENT_CF)])
# Add the mapping to the global list of mappings. No longer storing the cluster solution here.
# map = list(name = paste(EventMapName), threads = e)
#
# GlobalEventMappings <<- append(list(map), GlobalEventMappings )
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# print( get_event_mapping_names( GlobalEventMappings ) )
# save(GlobalEventMappings, file="eventMappings.RData")
# for debugging, this is really handy
# save(o,e,file="O_and_E_1.rdata")
return(eventMap)
}
# this one creates chunks based on the handoff index
# OccToEvents2: map occurrences into events by chunking at handoff breakpoints
# (handoffGap == 0 means every event CF changed), then cluster the chunks by
# sequence similarity and add one ZM_<k> column per cluster level.
# o            = dataframe of occurrences (needs handoffGap, seqNum, POVthreadNum)
# EventMapName = name under which to store the mapping; "" means do nothing
# EVENT_CF     = context factors that define events (aggregated into V_ columns)
# compare_CF   = context factors copied over for comparisons
# Returns the stored event map (whatever store_event_mapping returns).
OccToEvents2 <- function(o, EventMapName,EVENT_CF, compare_CF){
# put this here for now
timescale='mins'
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
#### First get the break points between the events
# whenever there is a zero handoff gap that means everything has changed
breakpoints = which(o$handoffGap == 0)
# Grab the breakpoints from the beginning of the threads as well
threadbreaks = which(o$seqNum == 1)
breakpoints = sort(union(threadbreaks,breakpoints))
### Use the break points to find the chunks -- just store the index back to the raw data
nChunks = length(breakpoints)
# print(paste("nChunks=",nChunks))
# make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
e = make_event_df(EVENT_CF, compare_CF, nChunks)
# need to create chunks WITHIN threads. Need to respect thread boundaries
# take union of the breakpoints, plus thread boundaries, plus 1st and last row
# counters for assigning thread and sequence numbers
thisThread=1 # just for counting in this loop
lastThread=0
seqNo=0 # resets for each new thread
for (chunkNo in 1:nChunks){
# Chunks start at the current breakpoint
start_idx=breakpoints[chunkNo]
# Chunks end at the next breakpoint, minus one
# for the last chunk,the stop_idx is the last row
if (chunkNo < nChunks){
stop_idx = breakpoints[chunkNo+1] - 1
} else if (chunkNo==nChunks){
stop_idx = nrow(o)
}
# assign the occurrences (the row indices of o covered by this chunk)
e$occurrences[[chunkNo]] = list(start_idx:stop_idx)
# e$eventStop[chunkNo] = as.integer(stop_idx)
# e$eventStart[chunkNo] = as.integer(start_idx)
# assign timestamp and duration (first occurrence's time; span to the last)
e$tStamp[chunkNo] = o$tStamp[start_idx]
e$eventDuration[chunkNo] = difftime(o$tStamp[stop_idx], o$tStamp[start_idx],units=timescale )
# copy in the threadNum and assign sequence number
e$threadNum[chunkNo] = o$POVthreadNum[start_idx]
thisThread = o$POVthreadNum[start_idx]
# fill in data for each of the context factors (taken from the first row)
for (cf in compare_CF){
e[chunkNo,cf] = as.character(o[start_idx,cf])
}
# the V_ columns sum the per-occurrence indicator vectors for this chunk
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = list(aggregate_VCF_for_event(o,e$occurrences[chunkNo],cf ))
}
# Advance or reset the seq counters (sequence restarts on each new thread)
if (thisThread == lastThread){
seqNo = seqNo +1
} else if (thisThread != lastThread){
lastThread = thisThread
seqNo = 1
}
e$seqNum[chunkNo] = seqNo
}
# convert them to factors
for (cf in compare_CF){
e[cf] = as.factor(e[,cf])
}
### Use optimal string alignment to compare the chunks. This is O(n^^2)
clust = hclust( dist_matrix_seq(e), method="ward.D2" )
## Create a new column for each cluster solution -- would be faster with data.table
for (cluster_level in 1:nChunks){
clevelName = paste0("ZM_",cluster_level)
e[clevelName] = cutree(clust, k=cluster_level)
} # for cluster_level
# for debugging, this is really handy
# save(o,e,file="O_and_E_2.rdata")
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# print( get_event_mapping_names( GlobalEventMappings ) )
# save(GlobalEventMappings, file="eventMappings.RData")
# need return the threads and also the cluster solution for display
return(eventMap)
}
# this one creates events based on frequent ngrams or regular expressions
# OccToEvents3: map occurrences into events by replacing frequent-ngram /
# regex patterns in each thread's text with event labels. Occurrences matched
# by a pattern are merged into one event; unmatched ("irregular") occurrences
# are kept or dropped according to KeepIrregularEvents.
# o                   = dataframe of occurrences
# EventMapName        = name under which to store the mapping; "" = do nothing
# EVENT_CF            = context factors that define events (V_ columns)
# compare_CF          = context factors copied over for comparisons
# TN, CF              = thread-number and event columns for the text vector
# rx                  = dataframe of patterns ($pattern, $label), comma delimited
# KeepIrregularEvents = 'Keep' or 'Drop' for occurrences matching no pattern
# Returns the stored event map (whatever store_event_mapping returns).
OccToEvents3 <- function(o, EventMapName,EVENT_CF, compare_CF,TN, CF, rx, KeepIrregularEvents){
# keep track of the length (token count) of each pattern
for (i in seq_len(nrow(rx)))
{rx$patLength[i] = length(unlist(strsplit(rx$pattern[i], ',')))
}
# put this here for now
timescale='mins'
# Only run if eventMapName is filled in; return empty data frame otherwise
if (EventMapName ==""){return(data.frame())}
# get the text vector for this set of threaded occurrences, delimited by commas
tv = thread_text_vector( o, TN, CF, ',' )
# apply the regex replacements to tv and split each thread back on the commas
tvrx = replace_regex_list( tv, rx )
tvrxs = lapply(1:length(tvrx), function(i) {unlist(strsplit(tvrx[[i]],','))})
# count the total number of chunks (events) across all threads
nChunks = length(unlist(tvrxs))
# make the dataframe for the results. This is the main data structure for the visualizations and comparisons.
e = make_event_df(EVENT_CF, compare_CF, nChunks)
# loop through the threads and fill in the data for each event:
# a token that is a regex label aggregates the rows its pattern covers;
# any other token copies one original row across.
chunkNo=0
original_row=0
for (thread in 1:length(tvrxs)){
# the events stay in sequence
for (sn in 1:length(tvrxs[[thread]])){
# increment the current row numbers
chunkNo = chunkNo+1
original_row=original_row+1
# (BUGFIX: removed a leftover debug print() of original_row that fired once
# per occurrence and spammed the console)
# assign the thread and sequence number
e$threadNum[chunkNo] = thread
e$seqNum[chunkNo] = sn
# copy over the comparison CFs from the first underlying occurrence
for (cf in compare_CF){
e[[chunkNo, cf]] = o[[original_row, cf]]
}
if (tvrxs[[thread]][sn] %in% rx$label ){
# matched a regex label: merge the pattern's rows into one event
# Use the ZM_1 column to store the new label
e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
# the event's occurrences are the rows covered by the pattern
rxLen = rx$patLength[which( rx$label==tvrxs[[thread]][sn])]
e$occurrences[[chunkNo]] = list(seq(original_row,original_row+rxLen-1,1))
original_row = original_row + rxLen-1
# compute the V_ based on the occurrences
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = list(aggregate_VCF_for_regex(o,e$occurrences[chunkNo],cf ))
}
# assign timestamp from the last covered occurrence
# (NOTE(review): eventDuration is not set for merged events -- confirm)
e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]
}
else if (KeepIrregularEvents=='Keep') {
# no pattern matched: copy the single occurrence straight across
# Use the ZM_1 column to store the new labels
e$ZM_1[chunkNo] = tvrxs[[thread]][sn]
e$occurrences[[chunkNo]] = original_row
# copy the rest of the data
for (cf in EVENT_CF){
VCF = paste0("V_",cf)
e[[chunkNo, VCF]] = o[[original_row, VCF]]
}
e[[chunkNo,'tStamp']] = o[[original_row,'tStamp']]
e[[chunkNo,'eventDuration']] = o[[original_row,'eventDuration']]
}
} # sn loop
} # thread loop
# take out the irregular events (empty rows) if so desired
if (KeepIrregularEvents=='Drop'){
# keep the subset where the event is not blank
e=subset(e, !ZM_1=='')
# should probably re-number the sequence numbers...
}
# store the event map in the GlobalEventMappings
eventMap = store_event_mapping(EventMapName, e)
# need return the threads and also the cluster solution for display
return(eventMap)
}
# Cluster the events of an existing mapping and store the result as a new
# event mapping with fresh ZM_ (cluster level) columns.
# e              = the event dataframe
# EventMapName   = name of the existing mapping (selected in the UI)
# NewMapName     = name under which to store the clustered mapping
# cluster_method = "Sequential similarity", "Contextual Similarity" or
#                  "Network Structure"
# event_CF       = event context factors (used for contextual similarity)
# Returns the hclust object for display, or NULL if no name was supplied.
clusterEvents <- function(e, EventMapName, NewMapName, cluster_method, event_CF){
  # only run if something is filled in
  # BUGFIX: use short-circuiting || -- with the elementwise |, a NULL
  # NewMapName made the condition logical(0) and if() raised an error.
  if (is.null(NewMapName) || NewMapName=="") return(NULL)
  # pick the distance matrix for the requested method
  if (cluster_method=="Sequential similarity"){
    dd = dist_matrix_seq(e)
  } else if (cluster_method=="Contextual Similarity"){
    dd = dist_matrix_context(e,event_CF)
  } else if (cluster_method=="Network Structure"){
    dd = dist_matrix_network(e)
  } else {
    # previously an unknown method left dd undefined and failed later;
    # fail fast with a clear message instead
    stop(paste("clusterEvents: unknown cluster_method:", cluster_method))
  }
  ### Use Ward clustering on the pairwise distances. This is O(n^^2)
  clust = hclust( dd, method="ward.D2" )
  # delete the old ZM_ columns; new cluster-level columns replace them
  e[grep("ZM_",colnames(e))]<-NULL
  # number of chunks is the number of rows (the number of events to be clustered)
  nChunks = nrow(e)
  # make new data frame with one ZM_ column per cluster level
  zm = setNames(data.frame(matrix(ncol = nChunks, nrow = nChunks)), paste0("ZM_", 1:nChunks))
  ## Create a new column for each cluster solution
  for (cluster_level in 1:nChunks){
    clevelName = paste0("ZM_",cluster_level)
    zm[clevelName] = cutree(clust, k=cluster_level)
  } # for cluster_level
  # append the cluster assignments onto the events to allow zooming
  newmap=cbind(e, zm)
  # store the event map in the GlobalEventMappings
  eventMap = store_event_mapping( NewMapName, newmap )
  # return the cluster solution for display
  return(clust)
}
# Computes the pairwise distance between chunks based on their occurrence
# sequences: each event is represented by the sequence of unique occurrence
# ids that make it up, compared with optimal string alignment (OSA) distance.
dist_matrix_seq <- function(e){
  nChunks = nrow(e)
  evector=vector(mode="list", length = nChunks)
  for (i in seq_len(nChunks)){
    # BUGFIX: use [[ ]] -- the original evector[i] = <vector> stored only the
    # first element of the sequence (with a warning), so distances were
    # computed on truncated sequences.
    evector[[i]]=unique(as.integer(unlist(e$occurrences[[i]])))
  }
  return( stringdistmatrix( evector, method="osa") )
}
# Computes the pairwise distance between chunks based on their context:
# stack the V_ (context-factor count) vectors into a matrix, one row per
# event, and take euclidean distances between the rows.
# e  = events, with V_ columns
# CF = event CFs
dist_matrix_context <- function( e, CF ){
  vcf_rows <- VCF_matrix(e, paste0("V_", CF))
  return( dist(vcf_rows, method = "euclidean") )
}
# Computes the pairwise distance between chunks for the "Network Structure"
# option. NOTE(review): currently identical to dist_matrix_seq -- it compares
# occurrence sequences, not an actual network metric; confirm intent.
dist_matrix_network <- function(e){
  nChunks = nrow(e)
  evector=vector(mode="list", length = nChunks)
  for (i in seq_len(nChunks)){
    # BUGFIX: use [[ ]] so the whole occurrence sequence is stored, not just
    # its first element (evector[i] = <vector> truncated with a warning).
    evector[[i]]=unique(as.integer(unlist(e$occurrences[[i]])))
  }
  return( stringdistmatrix( evector, method="osa") )
}
# Build the adjacency matrix of the network defined by an edge list
# (a two-column from/to structure coercible to a matrix).
net_adj_matrix <- function(edges){
  edge_graph <- graph_from_edgelist(as.matrix(edges))
  return(as_adj(edge_graph))
}
# new data structure for events (BTP 3/28)
# Build an empty event dataframe with N rows: fixed timing/thread columns,
# one NA column per comparison CF, one V_ column per event CF (these later
# hold the CF count vectors), and the ZM_1 event code column.
make_event_df <- function(event_CF,compare_CF,N){
  # fixed columns: event start time, duration, occurrence list, thread ids
  base_cols <- data.frame(
    tStamp = numeric(N),
    eventDuration = numeric(N),
    occurrences = integer(N),
    threadNum = integer(N),
    seqNum = integer(N))
  # one NA-filled column per comparison CF, named after the CF
  compare_cols <- setNames(data.frame(matrix(ncol = length(compare_CF), nrow = N)), compare_CF)
  # one NA-filled column per event CF, prefixed with V_
  vcf_cols <- setNames(data.frame(matrix(ncol = length(event_CF), nrow = N)), paste0("V_",event_CF))
  # combine, then add one more column for the event code/description
  e <- cbind(base_cols, compare_cols, vcf_cols)
  e$ZM_1 <- character(N)
  return(e)
}
# Convert one occurrence's context factor value into an indicator vector over
# the factor's levels (like this: 0 0 0 0 0 0 0 0 1 0 0).
# o  = the dataframe of occurrences
# CF = the context factor (column name); must be a factor column
# r  = the row (occurrence number in the One-to-One mapping)
convert_CF_to_vector <- function(o,CF,r){
  level_matches <- levels(o[[CF]]) == o[[r, CF]]
  return(as.integer(level_matches))
}
# Aggregate the VCF (CF vector) for that CF
# There are two layers to this.
# 1) aggregate_VCF_for_event
#    Within a single event, aggregate the VCF for the occurrences that make up that event.
#    Only used by the functions that convert occurrences to events.
# 2) aggregate_VCF_for_cluster
#    For a cluster level, aggregate the events at that cluster level (e.g., ZM_n).
#    Works on any event, even the one_to_one mapping.
#
# o is a dataframe of occurrences. The V_ (VCF) columns need not be filled in;
#   the indicator vector is re-computed here for each occurrence.
# occList is the list of occurrence row numbers for this event (e$occurrences)
# cf is the name of the contextual factor to create the VCF
# Returns the elementwise sum of the indicator vectors of all occurrences.
aggregate_VCF_for_event <- function(o, occList, cf){
  # (BUGFIX: removed two leftover debug print(aggCF) calls and an unused
  # VCF local that was never referenced)
  occs = unlist(occList)
  # start with the first one so the dimension of the vector is correct
  aggCF = convert_CF_to_vector(o, cf, occs[1])
  # now add the rest, if there are any
  if (length(occs) > 1){
    for (idx in seq(2,length(occs),1)){
      aggCF = aggCF + convert_CF_to_vector(o, cf, occs[idx])
    }}
  return(aggCF)
}
# Aggregate the VCF (CF count vector) across the occurrences of a regex-built
# event. Unlike aggregate_VCF_for_event, this version assumes the V_<cf>
# column is already computed on o (true for the one-to-one mapping from
# OccToEvents1).
# NOTE(review): the original author flagged this as "not correct..." --
# confirm the aggregation before relying on it.
# o       = dataframe of occurrences with V_<cf> columns filled in
# occList = the list of occurrence row numbers for this event (e$occurrences)
# cf      = the contextual factor whose VCF is aggregated
# Returns the elementwise sum of the stored V_<cf> vectors.
aggregate_VCF_for_regex <- function(o, occList, cf){
# get the column name for the VCF
VCF = paste0("V_",cf)
# start with the first occurrence so the vector has the right dimension
aggCF = unlist(o[unlist(occList)[1],VCF])
# print( aggCF)
# now add the rest elementwise, if there are any
if (length(unlist(occList)) > 1){
for (idx in seq(2,length(unlist(occList)),1)){
# print( aggCF)
aggCF = aggCF+unlist(o[[unlist(occList)[idx],VCF]])
}}
return(aggCF)
}
# Same basic idea, but works on a set of events within a cluster rather than a
# set of occurrences within an event: take the subset of rows for the cluster,
# convert their VCF vectors to a matrix, and add them up column-wise.
# e        = the events
# cf       = a single contextual factor, so call this in a loop
# eclust   = the cluster id to aggregate
# zoom_col = the ZM_ column holding the cluster assignment (could be anything)
aggregate_VCF_for_cluster <- function(e, cf, eclust, zoom_col){
  # the VCF column for this context factor
  vcf_col = paste0("V_",cf)
  # subset of events in the requested cluster -- just the VCF column
  cluster_rows = which(as.integer(e[[zoom_col]]) == eclust)
  s = e[cluster_rows, vcf_col]
  # empty cluster: nothing to aggregate
  if (is.null(unlist(s))) {
    return(NULL)
  }
  # one row per event; column sums give the aggregated counts
  return( colSums(matrix(unlist(s), nrow = length(s), byrow = TRUE)) )
}
# Bind the VCF matrices for a list of V_ column names into one wide matrix,
# one row per event (columns of the per-CF matrices side by side).
VCF_matrix <- function(e, vcf ){
  per_cf <- lapply(vcf, function(col_name) one_vcf_matrix(e, col_name))
  return(do.call(cbind, per_cf))
}
# Stack one V_ column (a list of equal-length vectors) into a matrix with one
# row per event.
one_vcf_matrix <- function(e, vcf){
  vcf_list <- e[[vcf]]
  return( matrix(unlist(vcf_list), nrow = length(vcf_list), byrow = TRUE) )
}
# This is used for the regex pages to show the threads.
# Similar code is used in count_ngrams and to make networks, but with
# different delimiters and a minimum sequence length (ngram size), which can
# be filtered after this function (see long_enough).
# Returns one delimited string per thread, in the order the unique thread
# numbers first appear in o[[TN]].
thread_text_vector <- function(o, TN, CF, delimiter){
  # Initialize text vector
  tv = vector(mode="character")
  # Loop through the unique thread numbers
  j=0
  for (i in unique(o[[TN]])){
    # events of this thread, in order; concatenate() joins them with spaces,
    # which are then swapped for the requested delimiter.
    # (cleanup: the original computed this subset twice, once into an unused
    # local and once inline)
    thread_events = o[o[[TN]]==i,CF]
    j=j+1
    tv[j] = str_replace_all(concatenate(thread_events),' ',delimiter)
  }
  return(tv)
}
# use this to replace patterns for regex and ngrams
# tv = the text vector for the set of threads
# rx = the dataframe of regular expressions ($pattern, $label)
# Every pattern is applied in order to every thread string; later patterns see
# the labels substituted by earlier ones.
replace_regex_list <- function(tv, rx ){
  for (thread_idx in 1:length(tv)) {
    for (rx_idx in 1:nrow(rx)) {
      tv[thread_idx] = str_replace_all(tv[thread_idx], rx$pattern[rx_idx], rx$label[rx_idx])
    }
  }
  return(tv)
}
# same function, but with lapply -- but does not work.
# replace_regex_list_lapply <- function(tv, rx){
#
# lapply(1:length(tv), function(i){
# lapply(1:nrow(rx),function(j){
# str_replace_all(tv[i], rx$pattern[j], rx$label[j] ) }
# ) })
# }
# No longer needed?
# selectize_frequent_ngrams<- function(e, TN, CF, minN, maxN, threshold){
#
# f=str_replace_all(trimws(frequent_ngrams(e, TN, CF, minN, maxN, threshold,TRUE)[,'ngrams'], which=c('right')), ' ',',')
# return(f)
# }
# combined set of frequent ngrams over sizes minN..maxN
# add parameter to make maximal a choice
# e, TN, CF   = threads, thread-number column, event column (see count_ngrams)
# minN, maxN  = range of ngram sizes to count (maxN < minN counts only minN)
# onlyMaximal = drop ngrams contained inside longer ones
# Returns the ngram table (ngrams, freq, len) sorted by descending frequency,
# with singletons (freq == 1) removed.
frequent_ngrams <- function(e, TN, CF, minN, maxN, onlyMaximal=TRUE){
  # count every size and stack the tables
  # (PERF: build a list and rbind once instead of growing with rbind in a loop)
  sizes = minN:max(minN, maxN)
  ng = do.call(rbind, lapply(sizes, function(n) count_ngrams(e, TN, CF, n)))
  # remove the rows that happen once and only keep the columns we want
  ng=ng[ng$freq>1,c('ngrams','freq', 'len')]
  # just take the maximal ones if so desired
  if (onlyMaximal) { ng=maximal_ngrams(ng) }
  # return the set sorted by most frequent
  return(ng[order(-ng$freq),])
}
# this filters out ngrams that are contained within others ('2 2' is part of '2 2 2')
maximal_ngrams <- function(ng){
  # count how many ngrams in the table each ngram matches; every ngram
  # matches at least itself
  match_counts = vapply(seq_len(nrow(ng)),
                        function(i) length(grep(ng$ngrams[i], ng$ngrams)),
                        integer(1))
  # maximal ngrams are the ones that match only themselves
  return(ng[which(match_counts == 1),])
}
# compute the support level for each ngram
# tv = text vectors for the threads
# ng = frequent ngrams data frame
# returns the ng data frame with support (and generativity) columns added
support_level <- function(tv, ng) {
  # work in space-delimited form
  tv = str_replace_all(tv, ',' , ' ')
  totalN = length(tv)
  # trailing whitespace on the ngrams would break the matching below
  ng$ngrams = trimws(ng$ngrams)
  # support = share of threads in which the ngram appears at least once
  thread_hits = lapply(1:nrow(ng), function(i){
    length(grep(ng$ngrams[i], tv)) })
  ng$support = unlist(thread_hits) / totalN
  # toss in the generativity level
  ng = generativity_level(tv, ng)
  return(ng)
}
# compute the generativity = in-degree and out-degree of each ngram
# For each ngram of length n, find the (n+1)-grams that contain it. A match
# can start at the first position (the ngram extends to the right, counted as
# out-degree) or later (the ngram is preceded by a token, counted as
# in-degree).
# tv = text vectors for the threads (comma- or space-delimited)
# ng = frequent ngrams data frame (needs $ngrams and $len)
# Returns ng with in_degree and out_degree columns added.
# NOTE(review): z is accumulated with c(z, zplus) across ngram sizes, so
# z[[i]] lines up with ng rows only if ng is ordered by len in the same order
# as unique(ng$len) -- confirm with callers.
generativity_level<- function(tv, ng){
# problem is that the tokens can be 1-3 characters long, and there are spaces...
# Big Idea for frequent n-grams: use the DT:: and let people sort, select and apply all the ngrams they want.
# Name them using the tokens but with a different delimiter to avoid confusion. Go Crazy!
# convert to spaces
tv=str_replace_all(tv, ',',' ')
# first get the range of ngram sizes we are looking for
nList = unique(ng$len)
z=list()
# loop through them
for (n in nList){
# print(paste('n = ',n))
# pick the ngrams of length n from the list given
ngn= ng[ng$len==n,]
# get ngrams of length n+1 -- make sure the threads are long enough
ngplus = get.phrasetable(ngram( long_enough(tv,n+1, ' '), n+1))
# this picks out the (n+1)-grams that contain each n-gram
w = lapply(1:nrow(ngn), function(i){
grep(ngn$ngrams[i],ngplus$ngrams)} )
#print(w)
# locate WHERE the n-gram sits inside each matching (n+1)-gram
zplus = lapply(1:nrow(ngn), function(i){
str_locate(ngplus$ngrams[w[[i]]],ngn$ngrams[i]) } )
# print(z)
z = c(z,zplus)
}
# compute the in and out degree from the match positions:
# a match starting after position 1 means something precedes the ngram (in),
# a match starting at position 1 means the ngram is extended to the right (out)
ng$in_degree = unlist(lapply(1:nrow(ng), function(i){
zm=z[[i]]
length( zm[zm[,1]>1,1] ) } ))
ng$out_degree = unlist( lapply(1:nrow(ng), function(i){
zm=z[[i]]
length( zm[zm[,1]==1,1] ) } ))
# ng$generativity = lapply(1:nrow(ng), function(i) {ng$out_degree[i] * ng$in_degree[i]})
return(ng)
}
# To avoid errors in count_ngrams, keep only the threads in the text vector tv
# whose token count (split on delimiter) is at least the ngram size n.
# This gets used in various places, so the delimiter is passed in.
long_enough = function(tv,n,delimiter){
  # (ROBUSTNESS: the original 1:length(tv) errored on an empty tv; vapply over
  # seq_along returns character(0) instead)
  token_counts = vapply(seq_along(tv),
                        function(i) length(unlist(strsplit(tv[[i]], delimiter))),
                        integer(1))
  return(tv[token_counts >= n])
}