#include using namespace Rcpp ; // #include #include #include #include class rule_interval { public: int rule_id; int start; int end; double cover; }; struct sort_intervals { bool operator()(const rule_interval &left, const rule_interval &right) { return (left.cover < right.cover); } }; double _normalized_distance(int start1, int end1, int start2, int end2, std::vector *series){ double res = 0; int count = 0; int len1 = end1 - start1; int len2 = end2 - start2; if(len1 == len2){ for(int i=0; iat(start1+i) - series->at(start2+i)) * (series->at(start1+i) - series->at(start2+i)); count++; } return sqrt(res) / (double) count; } int min_length = std::min(len1, len2); if(len1 == min_length){ std::vector subseries(len2); for(int i=0; iat(start2+i); } std::vector subseries_paa = _paa2(subseries, len1); for(int i=0; iat(start1+i) - subseries_paa[i]) * (series->at(start1+i) - subseries_paa[i]); count++; } return sqrt(res) / (double) count; } else { std::vector subseries(len1); for(int i=0; iat(start1+i); } std::vector subseries_paa = _paa2(subseries, len2); for(int i=0; iat(start2+i)) * (subseries_paa[i] - series->at(start2+i)); count++; } return sqrt(res) / (double) count; } } double _shrinked_distance(int start1, int end1, int start2, int end2, std::vector *series){ double res = 0; int count = 0; int len1 = end1 - start1; int len2 = end2 - start2; if(len1 == len2) { for(int i=0; iat(start1+i) - series->at(start2+i)) * (series->at(start1+i) - series->at(start2+i)); count++; } return sqrt(res) / (double) count; } else { int min_length = std::min(len1, len2); for(int i=0; iat(start1+i) - series->at(start2+i)) * (series->at(start1+i) - series->at(start2+i)); count++; } return sqrt(res) / (double) count; } } double _mean(std::vector *ts, int *start, int *end){ int sum = 0; for(int i=*start; i<*end; i++){ sum = sum + ts->at(i); } return (double) sum / (double) (*end - *start); } rra_discord_record find_best_rra_discord(std::vector *ts, int w_size, std::unordered_map *grammar, std::vector *indexes, std::vector *intervals, std::unordered_set *global_visited_positions){ // ***** // std::chrono::time_point<:chrono::system_clock> tstart0, tstart, tend; // tstart0 = std::chrono::system_clock::now(); int distance_calls_counter = 0; std::vector visit_array(ts->size(), -1); // for(auto it=intervals.begin(); it!=intervals.end(); ++it){ // Rcout << "R" << it->rule_id << " covr " << it->cover << std::endl; // } // init variables int bestSoFarPosition = -1; int bestSoFarLength = -1; int bestSoFarRule = 0; double bestSoFarDistance = -1; // outer loop over all intervals for(int i = 0; i < (int) intervals->size(); i++){ // **** // tstart = std::chrono::system_clock::now(); rule_interval c_interval = intervals->at(i); // Rcout << c_interval.rule_id << ", " << c_interval.cover << std::endl; auto find = global_visited_positions->find(c_interval.start); if(find != global_visited_positions->end()){ continue; } // mark the interval location std::unordered_set visited_locations; visited_locations.reserve(ts->size()); int markStart = c_interval.start - (c_interval.end - c_interval.start); if (markStart < 0) { markStart = 0; } int markEnd = c_interval.end; if (markEnd > (int) ts->size()) { markEnd = ts->size(); } for(int j=markStart;j::max(); // by default, we engage the random search bool do_random_search = true; //Rcout << " considering interval " << c_interval.start << "-" << c_interval.end << // " for rule " << c_interval.rule_id << // ", best so far dist " << bestSoFarDistance << std::endl; auto this_rule_occurrences = grammar->at(c_interval.rule_id)->rule_intervals; // Rcout << " going to iterate over " << this_rule_occurrences.size() << // " rule occurrences first " << std::endl; for(auto it=this_rule_occurrences.begin(); it !=this_rule_occurrences.end(); ++it) { // Rcout << "0.0\n"; int start = indexes->at(it->first); // Rcout << "0.1" << start << "\n"; auto found = visited_locations.find(start); if (found == visited_locations.end()) { visited_locations.emplace(start); int end = indexes->at(it->second) + w_size; // Rcout << " examining a candidate at " << start << "-" << // end << std::endl; distance_calls_counter++; double dist = _normalized_distance(c_interval.start, c_interval.end, start, end, ts); // keep track of best so far distance if (dist < nn_distance) { // Rcout << " better nn distance found " << dist << std::endl; nn_distance = dist; } if (dist < bestSoFarDistance) { // Rcout << " R " << c_interval.rule_id << ", dist " << dist << // " is less than best so far, breaking off the search" << std::endl; do_random_search = false; break; } } else { continue; } } // **** // tend = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds = tend - tstart; // std::cout << " . pre-search part done in " << elapsed_seconds.count() << "s\n"; // tstart = std::chrono::system_clock::now(); if(do_random_search){ // Rcout << " starting the random search ..." << // " nn dist " << nn_distance << std::endl; // Rcout << "visited locations "; // for(auto it=visited_locations.begin(); it != visited_locations.end(); ++it){ // Rcout << *it << ", "; //} //Rcout << std::endl; // init the visit array int cIndex = 0; for (unsigned j = 0; j < intervals->size(); j++) { rule_interval interval = intervals->at(j); auto found = visited_locations.find(interval.start); if (found == visited_locations.end()) { visit_array[cIndex] = j; cIndex++; //} else { // Rcout << " - skipped " << interval.start << std::endl; //} } } cIndex--; // shuffle the visit array for (int j = cIndex; j > 0; j--) { int index = armaRand() % (j + 1); int a = visit_array[index]; visit_array[index] = visit_array[j]; visit_array[j] = a; } // while there are unvisited locations while (cIndex >= 0) { rule_interval randomInterval = intervals->at(visit_array[cIndex]); cIndex--; //Rcout << " random candidate " << randomInterval.start << "-" << // randomInterval.end << ", cindex " << cIndex << std::endl; distance_calls_counter++; double dist = _normalized_distance(c_interval.start, c_interval.end, randomInterval.start, randomInterval.end, ts); if (dist < bestSoFarDistance) { // Rcout << " dist " << dist << // " is less than best so far, breaking off the search" << std::endl; nn_distance = dist; break; } if (dist < nn_distance) { // Rcout << " better nn distance found " << dist << std::endl; nn_distance = dist; } } // **** // tend = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds = tend - tstart; // std::cout << " . random search done in " << elapsed_seconds.count() << "s\n"; // tstart = std::chrono::system_clock::now(); } // random search if(nn_distance > bestSoFarDistance){ bestSoFarDistance = nn_distance; bestSoFarPosition = c_interval.start; bestSoFarLength = c_interval.end - c_interval.start; bestSoFarRule = c_interval.rule_id; // Rcout << " updating the discord " << nn_distance << " at " << bestSoFarPosition << // " of length " << bestSoFarLength << " for rule " << bestSoFarRule << std::endl; } } // Rcout << " RRA, distance calls: " << distance_calls_counter << std::endl; rra_discord_record res; res.rule = bestSoFarRule; res.start = bestSoFarPosition; res.end = bestSoFarPosition + bestSoFarLength; res.nn_distance = bestSoFarDistance; return res; } //' Finds a discord with RRA (Rare Rule Anomaly) algorithm. //' Usually works the best with higher than that for HOT-SAX sizes of discretization parameters //' (i.e., PAA and Alphabet sizes). //' //' @param series the input timeseries. //' @param w_size the sliding window size. //' @param paa_size the PAA size. //' @param a_size the alphabet size. //' @param nr_strategy the numerosity reduction strategy ("none", "exact", "mindist"). //' @param n_threshold the normalization threshold. //' @param discords_num the number of discords to report. //' @useDynLib jmotif //' @export //' @references Senin Pavel and Malinchik Sergey, //' SAX-VSM: Interpretable Time Series Classification Using SAX and Vector Space Model., //' Data Mining (ICDM), 2013 IEEE 13th International Conference on. //' @examples //' discords = find_discords_rra(ecg0606, 100, 4, 4, "none", 0.01, 1) //' plot(ecg0606, type = "l", col = "cornflowerblue", main = "ECG 0606") //' lines(x=c(discords[1,2]:(discords[1,2]+100)), //' y=ecg0606[discords[1,2]:(discords[1,2]+100)], col="red") // [[Rcpp::export]] Rcpp::DataFrame find_discords_rra(NumericVector series, int w_size, int paa_size, int a_size, CharacterVector nr_strategy, double n_threshold, int discords_num){ // ***** // std::chrono::time_point<:chrono::system_clock> tstart0, tstart, tend; // tstart0 = std::chrono::system_clock::now(); // tstart = std::chrono::system_clock::now(); std::vector ts = Rcpp::as<:vector>>(series); std::vector visit_array(ts.size(), -1); std::unordered_map sax_map = _sax_via_window( ts, w_size, paa_size, a_size, Rcpp::as<:string>(nr_strategy), n_threshold); // **** // tend = std::chrono::system_clock::now(); // std::chrono::duration elapsed_seconds = tend - tstart; // Rcout << " sax conversion: " << elapsed_seconds.count() << "s\n"; // tstart = std::chrono::system_clock::now(); // sax_map maps time-series positions to corresponding SAX words // to compose the string we need to order keys // std::vector indexes(sax_map.size()); int i=0; for(auto it = sax_map.begin(); it != sax_map.end(); ++it) { indexes[i] = it->first; i++; } sort( indexes.begin(), indexes.end() ); // Rcout << " there are " << indexes.size() << " SAX words..." << std::endl; // now compose the string // std::string sax_str; for(unsigned i=0; i grammar = _str_to_repair_grammar(sax_str); // **** // tend = std::chrono::system_clock::now(); // elapsed_seconds = tend - tstart; // Rcout << " grammar inferred in: " << elapsed_seconds.count() << "s\n"; // Rcout << " there are " << grammar.size() << " RePair rules including R0..." << std::endl; // ***** // tstart = std::chrono::system_clock::now(); // making intervals and ranking by the rule use // meanwhile build the coverage curve // std::vector intervals; std::vector coverage_array(ts.size(), 0); for(auto it = grammar.begin(); it != grammar.end(); ++it) { if(0 == it->first){ continue; // skip R0 } for(auto rit = (it->second)->rule_intervals.begin(); rit != (it->second)->rule_intervals.end(); ++rit) { int t_start = rit->first; int t_end = rit->second; // start and end here is for the string tokens, not for time series points // int start = indexes[t_start]; int end = indexes[t_end] + w_size; // Rcout << start << "_" << end << std::endl; // Rcout << " * rule interval " << start << " " << end << std::endl; for(int i=start; ifirst; rr.start = start; rr.end = end; // rr.cover = it->second->rule_use; // a shortcut intervals.push_back(rr); } } // we need to examine the coverage curve for continous zero intervals and mark those // rule_record* rec_zero_cover = new rule_record(); rec_zero_cover->rule_id = -1; rec_zero_cover->rule_string = "xxx"; rec_zero_cover->expanded_rule_string = "xxx"; bool need_placement = false; int start = -1; bool in_interval = false; for (unsigned i = 0; i < coverage_array.size(); i++) { if (0 == coverage_array[i] && !in_interval) { start = i; in_interval = true; } if (coverage_array[i] > 0 && in_interval) { need_placement = true; rule_interval ri; ri.cover=0; ri.start = start; ri.end=i; ri.rule_id=-1; intervals.push_back(ri); rec_zero_cover->rule_occurrences.push_back(start); rec_zero_cover->rule_intervals.push_back(std::make_pair(start, i)); intervals.push_back(ri); in_interval = false; // Rcout << " zero coverage from " << start << " to " << i << std::endl; } } if(need_placement) { grammar.emplace(std::make_pair(-1, rec_zero_cover)); } // compute the coverage for(auto it=intervals.begin(); it !=intervals.end(); ++it){ it->cover = _mean(&coverage_array, &it->start, &it->end); } // sort the intervals rare < frequent std::sort(intervals.begin(), intervals.end(), sort_intervals()); // ****** // tend = std::chrono::system_clock::now(); // elapsed_seconds = tend - tstart; // Rcout << " there are " << intervals.size() <<" rule intervals inferred in " // << elapsed_seconds.count() << std::endl; // Rcout << " top coverage for interval of rule " << intervals[0].rule_id << " starting at " // << intervals[0].start << " ending at " << intervals[0].end << " : " << intervals[0].cover // << std::endl; // int last_idx = intervals.size() - 1; // Rcout << " bottom coverage for interval of rule " << intervals[last_idx].rule_id // << " starting at " << intervals[last_idx].start << " ending at " << intervals[last_idx].end // << " : " << intervals[last_idx].cover << std::endl; // tstart = std::chrono::system_clock::now(); // from here on we'll be calling find best discord... std::unordered_set global_visited_positions; // global_visited_positions.reserve(ts.size()); std::vector discords; while((int) discords.size() < discords_num){ // tstart = std::chrono::system_clock::now(); rra_discord_record d = find_best_rra_discord(&ts, w_size, &grammar, &indexes, &intervals, &global_visited_positions); // Rcout << d.nn_distance; if(d.nn_distance<0){ break; } discords.push_back(d); // mark the location int markStart = d.start - (d.end - d.start); if (markStart < 0) { markStart = 0; } int markEnd = d.end; if (markEnd > (int) ts.size()) { markEnd = ts.size(); } for(int j=markStart;j rule_ids; std::vector starts; std::vector ends; std::vector lengths; std::vector nn_distances; for(auto it = discords.begin(); it != discords.end(); it++) { rule_ids.push_back(it->rule); starts.push_back(it->start); ends.push_back(it->end); lengths.push_back(it->end - it->start); nn_distances.push_back(it->nn_distance); } // make results return Rcpp::DataFrame::create( Named("rule_id") = rule_ids, Named("start") = starts, Named("end") = ends, Named("length") = lengths, Named("nn_distance") = nn_distances ); }