#define _sugar_
extern "C" {
#include
#include
#include
#include
#include "cl.h"
#include "cqp.h"
#ifndef PCRE2_CODE_UNIT_WIDTH
#define PCRE2_CODE_UNIT_WIDTH 8
#endif
#include
#include "server.h"
#include "cwb/cqp/corpmanag.h"
#include "_globalvars.h"
#include "_eval.h"
/* includes for utils */
#include
#include "cwb/utils/globals.h"
#include "cwb/utils/utils.h"
}
#include
using namespace Rcpp;
// [[Rcpp::interfaces(r, cpp)]]
/*
 * Run makeall_do_attribute() on one positional attribute of a corpus.
 *
 * x             corpus id (character scalar)
 * registry_dir  registry directory handed to cl_new_corpus() (character scalar)
 * p_attribute   name of the positional attribute (character scalar)
 *
 * Returns 0 once makeall_do_attribute() has been called, 1 if the corpus or
 * the attribute could not be obtained (a message is printed in that case).
 *
 * Side effect: the file-level global `corpus` is overwritten.
 */
// [[Rcpp::export(name=".cwb_makeall")]]
int cwb_makeall(SEXP x, SEXP registry_dir, SEXP p_attribute){

  /* NOTE(review): these strdup() copies are never freed. Whether the CL
     functions keep the pointers is not visible here, so they are left
     allocated -- confirm ownership before adding free(). */
  char *registry_directory = strdup(Rcpp::as<std::string>(registry_dir).c_str());
  char *attr_name = strdup(Rcpp::as<std::string>(p_attribute).c_str());
  char *corpus_id = strdup(Rcpp::as<std::string>(x).c_str());

  int validate = 1;
  ComponentID cid = CompLast; /* presumably "all components" -- TODO confirm */

  /* The corpus handle is dereferenced below, so bail out if it is missing. */
  corpus = cl_new_corpus(registry_directory, corpus_id);
  if (corpus == NULL) {
    Rprintf("Corpus %s not found in registry %s . Aborted.\n",
            corpus_id, registry_directory);
    return 1;
  }

  Rprintf("=== Makeall: processing corpus %s ===\n", corpus_id);
  Rprintf("Registry directory: %s\n", corpus->registry_dir);

  Attribute *attribute = cl_new_attribute(corpus, attr_name, ATT_POS);
  if (attribute == NULL) {
    Rprintf("Attribute %s.%s doesn't exist. Aborted.\n", corpus_id, attr_name);
    return 1;
  }

  makeall_do_attribute(attribute, cid, validate);
  Rprintf("========================================\n");

  return 0;
}
/*
 * Compute code lengths for one positional attribute via
 * compute_code_lengths() and, unless checks are skipped, verify the result
 * with decode_check_huff().
 *
 * x             corpus id (character scalar)
 * registry_dir  registry directory handed to cl_new_corpus() (character scalar)
 * p_attribute   name of the positional attribute (character scalar)
 *
 * Returns 0 on success, 1 if the corpus or the attribute was not found.
 *
 * Side effect: the file-level global `corpus` is overwritten; the corpus
 * obtained here is passed to cl_delete_corpus() before returning.
 */
// [[Rcpp::export(name=".cwb_huffcode")]]
int cwb_huffcode(SEXP x, SEXP registry_dir, SEXP p_attribute) {

  /* NOTE(review): these strdup() copies are never freed. Whether the CL
     functions keep the pointers is not visible here -- confirm ownership
     before adding free(). */
  char *registry_directory = strdup(Rcpp::as<std::string>(registry_dir).c_str());
  char *attr_name = strdup(Rcpp::as<std::string>(p_attribute).c_str());
  char *corpus_id = strdup(Rcpp::as<std::string>(x).c_str());

  char *output_fn = NULL;     /* no explicit output file name is supplied */
  Attribute *attr;
  HCD hc;
  int i_want_to_believe = 0;  /* skip error checks? */

  /* int all_attributes = 0; */
  /* protocol = stdout; */    /* 'delayed' init (see top of file) */

  if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
    Rprintf("Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_directory ? registry_directory
                                : central_corpus_directory()));
    return 1;
  }

  if ((attr = cl_new_attribute(corpus, attr_name, ATT_POS)) == NULL) {
    Rprintf("Attribute %s.%s doesn't exist. Aborted.\n", corpus_id, attr_name);
    /* Release the corpus on this path too, as the success path does. */
    cl_delete_corpus(corpus);
    return 1;
  }

  compute_code_lengths(attr, &hc, output_fn);
  if (! i_want_to_believe)
    decode_check_huff(attr, corpus_id, output_fn);

  cl_delete_corpus(corpus);

  return 0;
}
/*
 * Compress the reversed index of one positional attribute via
 * compress_reversed_index() and, unless checks are skipped, verify it with
 * decompress_check_reversed_index().
 *
 * x             corpus id (character scalar)
 * registry_dir  registry directory handed to cl_new_corpus() (character scalar)
 * p_attribute   name of the positional attribute (character scalar)
 *
 * Returns 0 on success, 1 if the corpus or the attribute was not found.
 *
 * Side effect: the file-level global `corpus` is overwritten.
 */
// [[Rcpp::export(name=".cwb_compress_rdx")]]
int cwb_compress_rdx(SEXP x, SEXP registry_dir, SEXP p_attribute) {

  /* NOTE(review): these strdup() copies are never freed. Whether the CL
     functions keep the pointers is not visible here -- confirm ownership
     before adding free(). */
  char *registry_directory = strdup(Rcpp::as<std::string>(registry_dir).c_str());
  char *attr_name = strdup(Rcpp::as<std::string>(p_attribute).c_str());
  char *corpus_id = strdup(Rcpp::as<std::string>(x).c_str());

  Attribute *attr;
  char *output_fn = NULL;     /* no explicit output file name is supplied */

#ifdef _WIN32
  int i_want_to_believe = 1;  /* skip error on Windows, for the time being */
#else
  int i_want_to_believe = 0;  /* do not skip error checks on macOS and Linux */
#endif

  /* debug_output = stderr; */ /* 'delayed' init (see top of file) */
  int debug = 0;

  if ((corpus = cl_new_corpus(registry_directory, corpus_id)) == NULL) {
    Rprintf("Corpus %s not found in registry %s . Aborted.\n",
            corpus_id,
            (registry_directory ? registry_directory
                                : central_corpus_directory()));
    compressrdx_cleanup(1);
    /* compressrdx_cleanup() is defined elsewhere; should it return rather
       than terminate, stop here instead of using a NULL corpus below. */
    return 1;
  }

  if ((attr = find_attribute(corpus, attr_name, ATT_POS, NULL)) == NULL) {
    Rprintf("Attribute %s.%s doesn't exist. Aborted.\n", corpus_id, attr_name);
    compressrdx_cleanup(1);
    /* Same reasoning as above: never continue with a NULL attribute. */
    return 1;
  }

  compress_reversed_index(attr, output_fn, corpus_id, debug);
  if (! i_want_to_believe)
    decompress_check_reversed_index(attr, output_fn, corpus_id, debug);

  /* compressrdx_cleanup(1); */
  return 0;
}
/*
 * Configure the encoder through its file-level globals, declare the requested
 * attributes, collect the input files found by encode_scan_directory() and
 * hand them to cwb_encode_worker().
 *
 * regfile              stored in `registry_file`
 * data_dir             stored in `directory`; also passed to every
 *                      p_att_declare() / s_att_declare() call
 * vrt_dir              directory passed to encode_scan_directory()
 * encoding             stored in `encoding_charset_name`
 * p_attributes         names declared with p_att_declare()
 * s_attributes_anno    names declared with s_att_declare(..., 1, 0)
 * s_attributes_noanno  names declared with s_att_declare(..., 0, 0)
 * skip_blank_lines     stored in `skip_empty_lines`
 * strip_whitespace     stored in `strip_blanks`
 * xml                  stored in `xml_aware`
 * quiet                stored in `quietly`
 * verbosity            stored in `verbose`
 *
 * Returns the value of cwb_encode_worker(input_files), which is also stored
 * in the global `nr_input_files`.
 */
// [[Rcpp::export(name=".cwb_encode")]]
int cwb_encode(
    SEXP regfile, SEXP data_dir, SEXP vrt_dir, SEXP encoding, Rcpp::StringVector p_attributes,
    Rcpp::StringVector s_attributes_anno, Rcpp::StringVector s_attributes_noanno,
    int skip_blank_lines, int strip_whitespace, int xml, int quiet, int verbosity){

  /* NOTE(review): the previous values of these globals are not freed; it is
     not visible here whether they were heap-allocated -- confirm before
     adding free(). */
  directory = strdup(Rcpp::as<std::string>(data_dir).c_str());
  registry_file = strdup(Rcpp::as<std::string>(regfile).c_str());
  encoding_charset_name = strdup(Rcpp::as<std::string>(encoding).c_str());

  /* reset global variables to initial state */
  p_encoder_ix = 0;
  s_encoder_ix = 0;
  nr_input_files = 0;
  current_input_file = 0;
  current_input_file_name = NULL;

  /* configure encoder */
  xml_aware = xml;
  skip_empty_lines = skip_blank_lines;
  strip_blanks = strip_whitespace;
  verbose = verbosity;
  quietly = quiet;

  /* declare p-attributes (call was formerly wattr_declare) */
  int p_attrs_n = p_attributes.length();
  for (int m = 0; m < p_attrs_n; m++){
    p_att_declare(p_attributes(m), directory, 0);
  }

  /* declare s-attributes with annotations (call was formerly range_declare) */
  int s_attrs_len = s_attributes_anno.length();
  for (int m = 0; m < s_attrs_len; m++){
    s_att_declare(s_attributes_anno(m), directory, 1, 0);
  }

  /* declare s-attributes without annotations */
  s_attrs_len = s_attributes_noanno.length();
  for (int m = 0; m < s_attrs_len; m++){
    s_att_declare(s_attributes_noanno(m), directory, 0, 0);
  }

  /* collect every file reported for vrt_dir as encoder input */
  input_files = cl_new_string_list();
  cl_string_list vrt_files = encode_scan_directory(strdup(Rcpp::as<std::string>(vrt_dir).c_str()));
  int len = cl_string_list_size(vrt_files);
  for (int i = 0; i < len; i++)
    cl_string_list_append(input_files, cl_string_list_get(vrt_files, i));
  cl_delete_string_list(vrt_files); /* allocated strings have been moved into input_files, so don't free() them */

  nr_input_files = cwb_encode_worker(input_files);
  return nr_input_files;
}