From faa8031b02ff15b83d1791a2ef3cfe85293dd1fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 11 Jun 2023 13:23:19 +0200 Subject: [PATCH 01/90] new fn region_to_strucs() --- NEWS.md | 8 ++++++- R/RcppExports.R | 15 ++++++++++++ man/region_to_strucs.Rd | 26 ++++++++++++++++++++ src/RcppExports.cpp | 15 ++++++++++++ src/addons.cpp | 53 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 man/region_to_strucs.Rd diff --git a/NEWS.md b/NEWS.md index 52a63db..f9014b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ -# RcppCWB 0.5.5.9001-.9003 +# RcppCWB 0.6.0.9001ff + +* New function `region_to_strucs()` to get minimumum and maximum struc of +s-attribute within region provided. Works also for nested s-attributes. + + +# RcppCWB 0.6.0 * Rcpp wrappers for Corpus Library (CL) functions are exposed directly and can be used in C++ functions imported using `Rcpp::sourceCpp()` or diff --git a/R/RcppExports.R b/R/RcppExports.R index fc1fc24..d5c2884 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -37,6 +37,21 @@ .Call(`_RcppCWB_region_matrix_context`, corpus, registry, region_matrix, p_attribute, s_attribute, boundary, left, right) } +#' Get vector with min and max struc of s-attribute within a region +#' +#' Look up the minimum and maximum struc of a s-attribute within a region. +#' Works for nested s-attributes. If there are no regions of the s-attribute +#' within the region, a vector with (two) `NA` values is returned. +#' @param corpus ID of a CWB corpus. +#' @param registry Path of the registry directory. If `NULL` (default), value +#' of environment variable 'CORPUS_REGISTRY' will be used. +#' @param s_attribute Name of nested structural attribute. +#' @param region Vector with left and right corpus position of region. +#' @return A length-two integer vector. 
+region_to_strucs <- function(corpus, s_attribute, region, registry = NULL) { + .Call(`_RcppCWB_region_to_strucs`, corpus, s_attribute, region, registry) +} + .cwb_version <- function() { .Call(`_RcppCWB_cwb_version`) } diff --git a/man/region_to_strucs.Rd b/man/region_to_strucs.Rd new file mode 100644 index 0000000..52ae164 --- /dev/null +++ b/man/region_to_strucs.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{region_to_strucs} +\alias{region_to_strucs} +\title{Get vector with min and max struc of s-attribute within a region} +\usage{ +region_to_strucs(corpus, s_attribute, region, registry = NULL) +} +\arguments{ +\item{corpus}{ID of a CWB corpus.} + +\item{s_attribute}{Name of nested structural attribute.} + +\item{region}{Vector with left and right corpus position of region.} + +\item{registry}{Path of the registry directory. If \code{NULL} (default), value +of environment variable 'CORPUS_REGISTRY' will be used.} +} +\value{ +A length-two integer vector. +} +\description{ +Look up the minimum and maximum struc of a s-attribute within a region. +Works for nested s-attributes. If there are no regions of the s-attribute +within the region, a vector with (two) \code{NA} values is returned. 
+} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b7b289c..8ce2032 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -136,6 +136,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// region_to_strucs +Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry); +RcppExport SEXP _RcppCWB_region_to_strucs(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP regionSEXP, SEXP registrySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); + Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type region(regionSEXP); + Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); + rcpp_result_gen = Rcpp::wrap(region_to_strucs(corpus, s_attribute, region, registry)); + return rcpp_result_gen; +END_RCPP +} // cwb_version Rcpp::StringVector cwb_version(); static SEXP _RcppCWB_cwb_version_try() { @@ -2660,6 +2674,7 @@ static const R_CallMethodDef CallEntries[] = { {"_RcppCWB_ids_to_count_matrix", (DL_FUNC) &_RcppCWB_ids_to_count_matrix, 1}, {"_RcppCWB_region_matrix_to_count_matrix", (DL_FUNC) &_RcppCWB_region_matrix_to_count_matrix, 4}, {"_RcppCWB_region_matrix_context", (DL_FUNC) &_RcppCWB_region_matrix_context, 8}, + {"_RcppCWB_region_to_strucs", (DL_FUNC) &_RcppCWB_region_to_strucs, 4}, {"_RcppCWB_cwb_version", (DL_FUNC) &_RcppCWB_cwb_version, 0}, {"_RcppCWB_p_attr_default", (DL_FUNC) &_RcppCWB_p_attr_default, 0}, {"_RcppCWB_s_attr", (DL_FUNC) &_RcppCWB_s_attr, 3}, diff --git a/src/addons.cpp b/src/addons.cpp index 865db3c..9629a91 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -14,6 +14,12 @@ extern "C" { #include + +/* avoid complications with including Rinternals.h */ +#define mkString Rf_mkString +SEXP Rf_mkString(const char *); + + /* short quasi-header */ Attribute* make_s_attribute(SEXP corpus, SEXP 
s_attribute, SEXP registry); Attribute* make_p_attribute(SEXP corpus, SEXP p_attribute, SEXP registry); @@ -423,3 +429,50 @@ Rcpp::IntegerMatrix region_matrix_context(SEXP corpus, SEXP registry, Rcpp::Inte return cpos_matrix; } + +//' Get vector with min and max struc of s-attribute within a region +//' +//' Look up the minimum and maximum struc of a s-attribute within a region. +//' Works for nested s-attributes. If there are no regions of the s-attribute +//' within the region, a vector with (two) `NA` values is returned. +//' @param corpus ID of a CWB corpus. +//' @param registry Path of the registry directory. If `NULL` (default), value +//' of environment variable 'CORPUS_REGISTRY' will be used. +//' @param s_attribute Name of nested structural attribute. +//' @param region Vector with left and right corpus position of region. +//' @return A length-two integer vector. +// [[Rcpp::export]] +Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry = R_NilValue){ + + if (region(0) > region(1)) + return Rcpp::IntegerVector::create(NA_INTEGER, NA_INTEGER); + + if (region.length() != 2) + return Rcpp::IntegerVector::create(NA_INTEGER, NA_INTEGER); + + if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); + + Attribute* att = make_s_attribute(corpus, s_attribute, registry); + Rcpp::IntegerVector strucs(2); + + bool more = true; + while (more){ + strucs(0) = cl_cpos2struc(att, region(0)); + if (strucs(0) >= 0) more = false; + if (region(0) > region(1)) more = false; + region(0)++; + }; + + more = true; + while (more){ + strucs(1) = cl_cpos2struc(att, region(1)); + if (strucs(1) >= 0) more = false; + if (region(1) < region(0)) more = false; + region(1)--; + }; + + if (region(0) < 0) region(0) = NA_INTEGER; + if (region(1) < 0) region(1) = NA_INTEGER; + + return strucs; +} From 8b5daedc225e6136ca6381d4c8d7b01fab62c5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 11 
Jun 2023 14:05:14 +0200 Subject: [PATCH 02/90] cl_cpos2lbound() and cl_cpos2rbound() return NA if cpos outsided struc #78 --- DESCRIPTION | 4 ++-- NEWS.md | 5 +++++ R/RcppExports.R | 10 +++++---- R/cl.R | 28 ----------------------- inst/include/RcppCWB_RcppExports.h | 28 +++++++++++------------ man/s_attributes.Rd | 16 +++---------- src/RcppExports.cpp | 36 +++++++++++++++--------------- src/cl.cpp | 35 ++++++++++++++++++++++------- 8 files changed, 75 insertions(+), 87 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index afe211b..28b89ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.0 -Date: 2023-03-21 +Version: 0.6.0.9001 +Date: 2023-06-11 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index f9014b2..e5a4cf4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,11 @@ * New function `region_to_strucs()` to get minimumum and maximum struc of s-attribute within region provided. Works also for nested s-attributes. +* Functions `cl_cpos2lbound()` and `cl_cpos2rbound()` return NA if corpus +position is outside stru for given s-attribute. #78. +* Functions `cl_cpos2lbound()` and `cl_cpos2rbound()` are exposed directly from +C++ without R wrappers, improving performance. Using the environment variable +'CORPUS_REGISTRY' if argument `registry` is handled implicitly now. 
# RcppCWB 0.6.0 diff --git a/R/RcppExports.R b/R/RcppExports.R index d5c2884..ba8c0e7 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -228,8 +228,9 @@ id_to_cpos <- function(p_attr, id) { .Call(`_RcppCWB_id_to_cpos`, p_attr, id) } -.cl_cpos2lbound <- function(corpus, s_attribute, cpos, registry) { - .Call(`_RcppCWB__cl_cpos2lbound`, corpus, s_attribute, cpos, registry) +#' @rdname s_attributes +cl_cpos2lbound <- function(corpus, s_attribute, cpos, registry = NULL) { + .Call(`_RcppCWB_cl_cpos2lbound`, corpus, s_attribute, cpos, registry) } #' @rdname cl_rework @@ -238,8 +239,9 @@ cpos_to_lbound <- function(s_attr, cpos) { .Call(`_RcppCWB_cpos_to_lbound`, s_attr, cpos) } -.cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry) { - .Call(`_RcppCWB__cl_cpos2rbound`, corpus, s_attribute, cpos, registry) +#' @rdname s_attributes +cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = NULL) { + .Call(`_RcppCWB_cl_cpos2rbound`, corpus, s_attribute, cpos, registry) } #' @rdname cl_rework diff --git a/R/cl.R b/R/cl.R index eb7ff3a..e605030 100644 --- a/R/cl.R +++ b/R/cl.R @@ -171,34 +171,6 @@ cl_struc2str <- function(corpus, s_attribute, struc, registry = Sys.getenv("CORP .cl_struc2str(corpus = corpus, s_attribute = s_attribute, struc = struc, registry = registry) } -#' @rdname s_attributes -cl_cpos2lbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORPUS_REGISTRY")){ - check_registry(registry) - check_corpus(corpus, registry, cqp = FALSE) - check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute) - - if (length(cpos) == 0L) return(integer()) - check_cpos(corpus = corpus, p_attribute = "word", cpos = cpos, registry = registry) - - .cl_cpos2lbound(corpus = corpus, s_attribute = s_attribute, cpos = cpos, registry = registry) -} - -#' @rdname s_attributes -cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("CORPUS_REGISTRY")){ - check_registry(registry) - check_corpus(corpus, 
registry, cqp = FALSE) - check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute) - - check_cpos(corpus = corpus, p_attribute = "word", cpos = cpos, registry = registry) - if (length(cpos) == 0L) return(integer()) - - .cl_cpos2rbound(corpus = corpus, s_attribute = s_attribute, cpos = cpos, registry = registry) -} - - - - - #' @title Using Positional Attributes. #' diff --git a/inst/include/RcppCWB_RcppExports.h b/inst/include/RcppCWB_RcppExports.h index 3f6d80b..1636c53 100644 --- a/inst/include/RcppCWB_RcppExports.h +++ b/inst/include/RcppCWB_RcppExports.h @@ -612,17 +612,17 @@ namespace RcppCWB { return Rcpp::as(rcpp_result_gen); } - inline Rcpp::IntegerVector _cl_cpos2lbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry) { - typedef SEXP(*Ptr__cl_cpos2lbound)(SEXP,SEXP,SEXP,SEXP); - static Ptr__cl_cpos2lbound p__cl_cpos2lbound = NULL; - if (p__cl_cpos2lbound == NULL) { - validateSignature("Rcpp::IntegerVector(*_cl_cpos2lbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); - p__cl_cpos2lbound = (Ptr__cl_cpos2lbound)R_GetCCallable("RcppCWB", "_RcppCWB__cl_cpos2lbound"); + inline Rcpp::IntegerVector cl_cpos2lbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry = R_NilValue) { + typedef SEXP(*Ptr_cl_cpos2lbound)(SEXP,SEXP,SEXP,SEXP); + static Ptr_cl_cpos2lbound p_cl_cpos2lbound = NULL; + if (p_cl_cpos2lbound == NULL) { + validateSignature("Rcpp::IntegerVector(*cl_cpos2lbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); + p_cl_cpos2lbound = (Ptr_cl_cpos2lbound)R_GetCCallable("RcppCWB", "_RcppCWB_cl_cpos2lbound"); } RObject rcpp_result_gen; { RNGScope RCPP_rngScope_gen; - rcpp_result_gen = p__cl_cpos2lbound(Shield(Rcpp::wrap(corpus)), Shield(Rcpp::wrap(s_attribute)), Shield(Rcpp::wrap(cpos)), Shield(Rcpp::wrap(registry))); + rcpp_result_gen = p_cl_cpos2lbound(Shield(Rcpp::wrap(corpus)), Shield(Rcpp::wrap(s_attribute)), Shield(Rcpp::wrap(cpos)), Shield(Rcpp::wrap(registry))); } if 
(rcpp_result_gen.inherits("interrupted-error")) throw Rcpp::internal::InterruptedException(); @@ -654,17 +654,17 @@ namespace RcppCWB { return Rcpp::as(rcpp_result_gen); } - inline Rcpp::IntegerVector _cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry) { - typedef SEXP(*Ptr__cl_cpos2rbound)(SEXP,SEXP,SEXP,SEXP); - static Ptr__cl_cpos2rbound p__cl_cpos2rbound = NULL; - if (p__cl_cpos2rbound == NULL) { - validateSignature("Rcpp::IntegerVector(*_cl_cpos2rbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); - p__cl_cpos2rbound = (Ptr__cl_cpos2rbound)R_GetCCallable("RcppCWB", "_RcppCWB__cl_cpos2rbound"); + inline Rcpp::IntegerVector cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry = R_NilValue) { + typedef SEXP(*Ptr_cl_cpos2rbound)(SEXP,SEXP,SEXP,SEXP); + static Ptr_cl_cpos2rbound p_cl_cpos2rbound = NULL; + if (p_cl_cpos2rbound == NULL) { + validateSignature("Rcpp::IntegerVector(*cl_cpos2rbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); + p_cl_cpos2rbound = (Ptr_cl_cpos2rbound)R_GetCCallable("RcppCWB", "_RcppCWB_cl_cpos2rbound"); } RObject rcpp_result_gen; { RNGScope RCPP_rngScope_gen; - rcpp_result_gen = p__cl_cpos2rbound(Shield(Rcpp::wrap(corpus)), Shield(Rcpp::wrap(s_attribute)), Shield(Rcpp::wrap(cpos)), Shield(Rcpp::wrap(registry))); + rcpp_result_gen = p_cl_cpos2rbound(Shield(Rcpp::wrap(corpus)), Shield(Rcpp::wrap(s_attribute)), Shield(Rcpp::wrap(cpos)), Shield(Rcpp::wrap(registry))); } if (rcpp_result_gen.inherits("interrupted-error")) throw Rcpp::internal::InterruptedException(); diff --git a/man/s_attributes.Rd b/man/s_attributes.Rd index debbc19..5eaec3a 100644 --- a/man/s_attributes.Rd +++ b/man/s_attributes.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cl.R +% Please edit documentation in R/cl.R, R/RcppExports.R \name{CL: s_attributes} \alias{CL: s_attributes} \alias{cl_cpos2struc} @@ -30,19 +30,9 @@ cl_struc2str( registry = 
Sys.getenv("CORPUS_REGISTRY") ) -cl_cpos2lbound( - corpus, - s_attribute, - cpos, - registry = Sys.getenv("CORPUS_REGISTRY") -) +cl_cpos2lbound(corpus, s_attribute, cpos, registry = NULL) -cl_cpos2rbound( - corpus, - s_attribute, - cpos, - registry = Sys.getenv("CORPUS_REGISTRY") -) +cl_cpos2rbound(corpus, s_attribute, cpos, registry = NULL) } \arguments{ \item{corpus}{name of a CWB corpus (upper case)} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8ce2032..39ad28c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1148,24 +1148,24 @@ RcppExport SEXP _RcppCWB_id_to_cpos(SEXP p_attrSEXP, SEXP idSEXP) { UNPROTECT(1); return rcpp_result_gen; } -// _cl_cpos2lbound -Rcpp::IntegerVector _cl_cpos2lbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry); -static SEXP _RcppCWB__cl_cpos2lbound_try(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { +// cl_cpos2lbound +Rcpp::IntegerVector cl_cpos2lbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry); +static SEXP _RcppCWB_cl_cpos2lbound_try(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type cpos(cposSEXP); Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); - rcpp_result_gen = Rcpp::wrap(_cl_cpos2lbound(corpus, s_attribute, cpos, registry)); + rcpp_result_gen = Rcpp::wrap(cl_cpos2lbound(corpus, s_attribute, cpos, registry)); return rcpp_result_gen; END_RCPP_RETURN_ERROR } -RcppExport SEXP _RcppCWB__cl_cpos2lbound(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { +RcppExport SEXP _RcppCWB_cl_cpos2lbound(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { SEXP rcpp_result_gen; { Rcpp::RNGScope rcpp_rngScope_gen; 
- rcpp_result_gen = PROTECT(_RcppCWB__cl_cpos2lbound_try(corpusSEXP, s_attributeSEXP, cposSEXP, registrySEXP)); + rcpp_result_gen = PROTECT(_RcppCWB_cl_cpos2lbound_try(corpusSEXP, s_attributeSEXP, cposSEXP, registrySEXP)); } Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); if (rcpp_isInterrupt_gen) { @@ -1220,24 +1220,24 @@ RcppExport SEXP _RcppCWB_cpos_to_lbound(SEXP s_attrSEXP, SEXP cposSEXP) { UNPROTECT(1); return rcpp_result_gen; } -// _cl_cpos2rbound -Rcpp::IntegerVector _cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry); -static SEXP _RcppCWB__cl_cpos2rbound_try(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { +// cl_cpos2rbound +Rcpp::IntegerVector cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry); +static SEXP _RcppCWB_cl_cpos2rbound_try(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type cpos(cposSEXP); Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); - rcpp_result_gen = Rcpp::wrap(_cl_cpos2rbound(corpus, s_attribute, cpos, registry)); + rcpp_result_gen = Rcpp::wrap(cl_cpos2rbound(corpus, s_attribute, cpos, registry)); return rcpp_result_gen; END_RCPP_RETURN_ERROR } -RcppExport SEXP _RcppCWB__cl_cpos2rbound(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { +RcppExport SEXP _RcppCWB_cl_cpos2rbound(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP cposSEXP, SEXP registrySEXP) { SEXP rcpp_result_gen; { Rcpp::RNGScope rcpp_rngScope_gen; - rcpp_result_gen = PROTECT(_RcppCWB__cl_cpos2rbound_try(corpusSEXP, s_attributeSEXP, cposSEXP, registrySEXP)); + rcpp_result_gen = PROTECT(_RcppCWB_cl_cpos2rbound_try(corpusSEXP, s_attributeSEXP, 
cposSEXP, registrySEXP)); } Rboolean rcpp_isInterrupt_gen = Rf_inherits(rcpp_result_gen, "interrupted-error"); if (rcpp_isInterrupt_gen) { @@ -2548,9 +2548,9 @@ static int _RcppCWB_RcppExport_validate(const char* sig) { signatures.insert("Rcpp::IntegerVector(*id_to_freq)(SEXP,Rcpp::IntegerVector)"); signatures.insert("Rcpp::IntegerVector(*.cl_id2cpos)(SEXP,SEXP,SEXP,SEXP)"); signatures.insert("Rcpp::IntegerVector(*id_to_cpos)(SEXP,Rcpp::IntegerVector)"); - signatures.insert("Rcpp::IntegerVector(*.cl_cpos2lbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); + signatures.insert("Rcpp::IntegerVector(*cl_cpos2lbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); signatures.insert("Rcpp::IntegerVector(*cpos_to_lbound)(SEXP,Rcpp::IntegerVector)"); - signatures.insert("Rcpp::IntegerVector(*.cl_cpos2rbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); + signatures.insert("Rcpp::IntegerVector(*cl_cpos2rbound)(SEXP,SEXP,Rcpp::IntegerVector,SEXP)"); signatures.insert("Rcpp::IntegerVector(*cpos_to_rbound)(SEXP,Rcpp::IntegerVector)"); signatures.insert("SEXP(*.cl_find_corpus)(SEXP,SEXP)"); signatures.insert("SEXP(*.cl_new_attribute)(SEXP,SEXP,int)"); @@ -2621,9 +2621,9 @@ RcppExport SEXP _RcppCWB_RcppExport_registerCCallable() { R_RegisterCCallable("RcppCWB", "_RcppCWB_id_to_freq", (DL_FUNC)_RcppCWB_id_to_freq_try); R_RegisterCCallable("RcppCWB", "_RcppCWB_.cl_id2cpos", (DL_FUNC)_RcppCWB__cl_id2cpos_try); R_RegisterCCallable("RcppCWB", "_RcppCWB_id_to_cpos", (DL_FUNC)_RcppCWB_id_to_cpos_try); - R_RegisterCCallable("RcppCWB", "_RcppCWB_.cl_cpos2lbound", (DL_FUNC)_RcppCWB__cl_cpos2lbound_try); + R_RegisterCCallable("RcppCWB", "_RcppCWB_cl_cpos2lbound", (DL_FUNC)_RcppCWB_cl_cpos2lbound_try); R_RegisterCCallable("RcppCWB", "_RcppCWB_cpos_to_lbound", (DL_FUNC)_RcppCWB_cpos_to_lbound_try); - R_RegisterCCallable("RcppCWB", "_RcppCWB_.cl_cpos2rbound", (DL_FUNC)_RcppCWB__cl_cpos2rbound_try); + R_RegisterCCallable("RcppCWB", "_RcppCWB_cl_cpos2rbound", (DL_FUNC)_RcppCWB_cl_cpos2rbound_try); 
R_RegisterCCallable("RcppCWB", "_RcppCWB_cpos_to_rbound", (DL_FUNC)_RcppCWB_cpos_to_rbound_try); R_RegisterCCallable("RcppCWB", "_RcppCWB_.cl_find_corpus", (DL_FUNC)_RcppCWB__cl_find_corpus_try); R_RegisterCCallable("RcppCWB", "_RcppCWB_.cl_new_attribute", (DL_FUNC)_RcppCWB__cl_new_attribute_try); @@ -2703,9 +2703,9 @@ static const R_CallMethodDef CallEntries[] = { {"_RcppCWB_id_to_freq", (DL_FUNC) &_RcppCWB_id_to_freq, 2}, {"_RcppCWB__cl_id2cpos", (DL_FUNC) &_RcppCWB__cl_id2cpos, 4}, {"_RcppCWB_id_to_cpos", (DL_FUNC) &_RcppCWB_id_to_cpos, 2}, - {"_RcppCWB__cl_cpos2lbound", (DL_FUNC) &_RcppCWB__cl_cpos2lbound, 4}, + {"_RcppCWB_cl_cpos2lbound", (DL_FUNC) &_RcppCWB_cl_cpos2lbound, 4}, {"_RcppCWB_cpos_to_lbound", (DL_FUNC) &_RcppCWB_cpos_to_lbound, 2}, - {"_RcppCWB__cl_cpos2rbound", (DL_FUNC) &_RcppCWB__cl_cpos2rbound, 4}, + {"_RcppCWB_cl_cpos2rbound", (DL_FUNC) &_RcppCWB_cl_cpos2rbound, 4}, {"_RcppCWB_cpos_to_rbound", (DL_FUNC) &_RcppCWB_cpos_to_rbound, 2}, {"_RcppCWB__cl_find_corpus", (DL_FUNC) &_RcppCWB__cl_find_corpus, 2}, {"_RcppCWB__cl_new_attribute", (DL_FUNC) &_RcppCWB__cl_new_attribute, 3}, diff --git a/src/cl.cpp b/src/cl.cpp index 9d7c6ad..7fec6fc 100644 --- a/src/cl.cpp +++ b/src/cl.cpp @@ -28,6 +28,13 @@ extern "C" { using namespace Rcpp; // [[Rcpp::interfaces(r, cpp)]] +/* avoid complications with including Rinternals.h */ +#define mkString Rf_mkString +SEXP Rf_mkString(const char *); +/* end of quasi-header */ + + + char* cl_get_version(); char* cl_get_p_attr_default(); @@ -456,16 +463,22 @@ Rcpp::IntegerVector _cl_cpos2lbound(Attribute * att, Rcpp::IntegerVector cpos){ for (i = 0; i < len; i++){ struc = cl_cpos2struc(att, cpos(i)); - cl_struc2cpos(att, struc, &lb, &rb); - result(i) = lb; + if (struc >= 0){ + cl_struc2cpos(att, struc, &lb, &rb); + result(i) = lb; + } else { + result(i) = NA_INTEGER; + } } return( result ); } -// [[Rcpp::export(name=".cl_cpos2lbound")]] -Rcpp::IntegerVector _cl_cpos2lbound(SEXP corpus, SEXP s_attribute, 
Rcpp::IntegerVector cpos, SEXP registry){ +//' @rdname s_attributes +// [[Rcpp::export]] +Rcpp::IntegerVector cl_cpos2lbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry = R_NilValue){ + if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); Attribute* att = make_s_attribute(corpus, s_attribute, registry); return(_cl_cpos2lbound(att, cpos)); } @@ -489,16 +502,22 @@ Rcpp::IntegerVector _cl_cpos2rbound(Attribute* att, Rcpp::IntegerVector cpos){ for (i = 0; i < len; i++){ struc = cl_cpos2struc(att, cpos(i)); - cl_struc2cpos(att, struc, &lb, &rb); - result(i) = rb; + if (struc >= 0){ + cl_struc2cpos(att, struc, &lb, &rb); + result(i) = rb; + } else { + result(i) = NA_INTEGER; + } } return( result ); } -// [[Rcpp::export(name=".cl_cpos2rbound")]] -Rcpp::IntegerVector _cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry){ +//' @rdname s_attributes +// [[Rcpp::export]] +Rcpp::IntegerVector cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry = R_NilValue){ + if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); Attribute* att = make_s_attribute(corpus, s_attribute, registry); return(_cl_cpos2rbound(att, cpos)); } From d85f2ef34f08a3e094ad6c6c6476fca10385c565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 11 Jun 2023 14:10:20 +0200 Subject: [PATCH 03/90] documentation of cl_cpos2rbound()/cl_cpos2lbound() #78 --- R/RcppExports.R | 2 ++ man/s_attributes.Rd | 4 ++++ src/cl.cpp | 2 ++ 3 files changed, 8 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index ba8c0e7..9849694 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -240,6 +240,8 @@ cpos_to_lbound <- function(s_attr, cpos) { } #' @rdname s_attributes +#' @details `cl_cpos2rbound()` and `cl_cpos2lbound()` return `NA` for values of +#' cpos that are outside a struc for the structural attribute given. 
cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = NULL) { .Call(`_RcppCWB_cl_cpos2rbound`, corpus, s_attribute, cpos, registry) } diff --git a/man/s_attributes.Rd b/man/s_attributes.Rd index 5eaec3a..b21af4c 100644 --- a/man/s_attributes.Rd +++ b/man/s_attributes.Rd @@ -54,6 +54,10 @@ a right corpus position. The corpus library (CL) offers a set of functions to make the translations between corpus positions (cpos) and strucs (struc). } +\details{ +\code{cl_cpos2rbound()} and \code{cl_cpos2lbound()} return \code{NA} for values of +cpos that are outside a struc for the structural attribute given. +} \examples{ # get metadata for matches of token # scenario: id of the texts with occurrence of 'oil' diff --git a/src/cl.cpp b/src/cl.cpp index 7fec6fc..b05abb5 100644 --- a/src/cl.cpp +++ b/src/cl.cpp @@ -515,6 +515,8 @@ Rcpp::IntegerVector _cl_cpos2rbound(Attribute* att, Rcpp::IntegerVector cpos){ //' @rdname s_attributes +//' @details `cl_cpos2rbound()` and `cl_cpos2lbound()` return `NA` for values of +//' cpos that are outside a struc for the structural attribute given. 
// [[Rcpp::export]] Rcpp::IntegerVector cl_cpos2rbound(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector cpos, SEXP registry = R_NilValue){ if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); From b762fcd0b5680922d226aebc9c3cd95cc69006b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 11 Jun 2023 14:19:56 +0200 Subject: [PATCH 04/90] minimal modification of cl_struc_values() --- R/cl.R | 3 +-- src/cl.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/cl.R b/R/cl.R index e605030..7a146bb 100644 --- a/R/cl.R +++ b/R/cl.R @@ -373,8 +373,7 @@ cl_charset_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){ cl_struc_values <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){ check_corpus(corpus = corpus, registry = registry, cqp = FALSE) registry <- normalizePath(path.expand(registry)) - i <- .cl_struc_values(corpus = corpus, s_attribute = s_attribute, registry = registry) - if (i == 1L) TRUE else if (i == 0L) FALSE else if (i < 0L) as.integer(NA) + .cl_struc_values(corpus = corpus, s_attribute = s_attribute, registry = registry) } #' Get information from registry file diff --git a/src/cl.cpp b/src/cl.cpp index b05abb5..48aef7e 100644 --- a/src/cl.cpp +++ b/src/cl.cpp @@ -649,8 +649,7 @@ Rcpp::StringVector _cl_charset_name(SEXP corpus, SEXP registry){ // [[Rcpp::export(name=".cl_struc_values")]] int _cl_struc_values(SEXP corpus, SEXP s_attribute, SEXP registry){ Attribute* att = make_s_attribute(corpus, s_attribute, registry); - int y = cl_struc_values(att); - return y; + return cl_struc_values(att); } From ac323ad5b8bf9c99abf645cb3c330434c3f2fc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 12 Jun 2023 10:34:08 +0200 Subject: [PATCH 05/90] fn region_matrix_to_struc_matrix() drafted --- DESCRIPTION | 4 +- NEWS.md | 1 + R/RcppExports.R | 8 ++++ ...gion_to_strucs.Rd => regions_to_strucs.Rd} | 11 ++++++ src/RcppExports.cpp | 15 
+++++++ src/addons.cpp | 39 +++++++++++++++++++ 6 files changed, 76 insertions(+), 2 deletions(-) rename man/{region_to_strucs.Rd => regions_to_strucs.Rd} (76%) diff --git a/DESCRIPTION b/DESCRIPTION index 28b89ed..566ea26 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.0.9001 -Date: 2023-06-11 +Version: 0.6.0.9002 +Date: 2023-06-12 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index e5a4cf4..587ca67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * New function `region_to_strucs()` to get minimumum and maximum struc of s-attribute within region provided. Works also for nested s-attributes. +* New function `region_matrix_to_struc_matrix()`. * Functions `cl_cpos2lbound()` and `cl_cpos2rbound()` return NA if corpus position is outside stru for given s-attribute. #78. * Functions `cl_cpos2lbound()` and `cl_cpos2rbound()` are exposed directly from diff --git a/R/RcppExports.R b/R/RcppExports.R index 9849694..d078a09 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -48,10 +48,18 @@ #' @param s_attribute Name of nested structural attribute. #' @param region Vector with left and right corpus position of region. #' @return A length-two integer vector. +#' @rdname regions_to_strucs region_to_strucs <- function(corpus, s_attribute, region, registry = NULL) { .Call(`_RcppCWB_region_to_strucs`, corpus, s_attribute, region, registry) } +#' @param region_matrix A two-column `matrix` with regions, left corpus +#' positions in column 1, right corpus positions in column 2. 
+#' @rdname regions_to_strucs +region_matrix_to_struc_matrix <- function(corpus, s_attribute, region_matrix, registry = NULL) { + .Call(`_RcppCWB_region_matrix_to_struc_matrix`, corpus, s_attribute, region_matrix, registry) +} + .cwb_version <- function() { .Call(`_RcppCWB_cwb_version`) } diff --git a/man/region_to_strucs.Rd b/man/regions_to_strucs.Rd similarity index 76% rename from man/region_to_strucs.Rd rename to man/regions_to_strucs.Rd index 52ae164..d89fa70 100644 --- a/man/region_to_strucs.Rd +++ b/man/regions_to_strucs.Rd @@ -2,9 +2,17 @@ % Please edit documentation in R/RcppExports.R \name{region_to_strucs} \alias{region_to_strucs} +\alias{region_matrix_to_struc_matrix} \title{Get vector with min and max struc of s-attribute within a region} \usage{ region_to_strucs(corpus, s_attribute, region, registry = NULL) + +region_matrix_to_struc_matrix( + corpus, + s_attribute, + region_matrix, + registry = NULL +) } \arguments{ \item{corpus}{ID of a CWB corpus.} @@ -15,6 +23,9 @@ region_to_strucs(corpus, s_attribute, region, registry = NULL) \item{registry}{Path of the registry directory. If \code{NULL} (default), value of environment variable 'CORPUS_REGISTRY' will be used.} + +\item{region_matrix}{A two-column \code{matrix} with regions, left corpus +positions in column 1, right corpus positions in column 2.} } \value{ A length-two integer vector. 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 39ad28c..975ab42 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -150,6 +150,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// region_matrix_to_struc_matrix +Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, Rcpp::IntegerMatrix region_matrix, SEXP registry); +RcppExport SEXP _RcppCWB_region_matrix_to_struc_matrix(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP region_matrixSEXP, SEXP registrySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); + Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type region_matrix(region_matrixSEXP); + Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); + rcpp_result_gen = Rcpp::wrap(region_matrix_to_struc_matrix(corpus, s_attribute, region_matrix, registry)); + return rcpp_result_gen; +END_RCPP +} // cwb_version Rcpp::StringVector cwb_version(); static SEXP _RcppCWB_cwb_version_try() { @@ -2675,6 +2689,7 @@ static const R_CallMethodDef CallEntries[] = { {"_RcppCWB_region_matrix_to_count_matrix", (DL_FUNC) &_RcppCWB_region_matrix_to_count_matrix, 4}, {"_RcppCWB_region_matrix_context", (DL_FUNC) &_RcppCWB_region_matrix_context, 8}, {"_RcppCWB_region_to_strucs", (DL_FUNC) &_RcppCWB_region_to_strucs, 4}, + {"_RcppCWB_region_matrix_to_struc_matrix", (DL_FUNC) &_RcppCWB_region_matrix_to_struc_matrix, 4}, {"_RcppCWB_cwb_version", (DL_FUNC) &_RcppCWB_cwb_version, 0}, {"_RcppCWB_p_attr_default", (DL_FUNC) &_RcppCWB_p_attr_default, 0}, {"_RcppCWB_s_attr", (DL_FUNC) &_RcppCWB_s_attr, 3}, diff --git a/src/addons.cpp b/src/addons.cpp index 9629a91..d13aa0b 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -441,6 +441,7 @@ Rcpp::IntegerMatrix region_matrix_context(SEXP corpus, SEXP registry, Rcpp::Inte //' @param s_attribute Name of nested 
structural attribute. //' @param region Vector with left and right corpus position of region. //' @return A length-two integer vector. +//' @rdname regions_to_strucs // [[Rcpp::export]] Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry = R_NilValue){ @@ -476,3 +477,41 @@ Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::Intege return strucs; } + + +//' @param region_matrix A two-column `matrix` with regions, left corpus +//' positions in column 1, right corpus positions in column 2. +//' @rdname regions_to_strucs +// [[Rcpp::export]] +Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, Rcpp::IntegerMatrix region_matrix, SEXP registry = R_NilValue){ + + if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); + Attribute* att = make_s_attribute(corpus, s_attribute, registry); + + Rcpp::IntegerMatrix struc_matrix(region_matrix.nrow(), 2); + bool more; + int i; + + for (i = 0; i < region_matrix.nrow(); i++){ + more = true; + while (more){ + struc_matrix(i,0) = cl_cpos2struc(att, region_matrix(i,0)); + if (struc_matrix(i,0) >= 0) more = false; + if (region_matrix(i,0) > region_matrix(i,1)) more = false; + region_matrix(i,0)++; + }; + + more = true; + while (more){ + struc_matrix(i,1) = cl_cpos2struc(att, region_matrix(i,1)); + if (struc_matrix(i,1) >= 0) more = false; + if (region_matrix(i,1) < region_matrix(i,0)) more = false; + region_matrix(i,1)--; + }; + + if (region_matrix(i,0) < 0) region_matrix(i,0) = NA_INTEGER; + if (region_matrix(i,1) < 0) region_matrix(i,1) = NA_INTEGER; + } + + return struc_matrix; +} From 4ccd12543d05596acce88e7523a9d556705fa7e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 12 Jun 2023 16:32:00 +0200 Subject: [PATCH 06/90] region_to_strucs() as special case of region_matrix_to_struc_matrix() --- DESCRIPTION | 2 +- R/RcppExports.R | 26 +++++++----- man/regions_to_strucs.Rd | 28 
+++++++------ src/RcppExports.cpp | 22 +++++----- src/addons.cpp | 90 ++++++++++++++++++++-------------------- 5 files changed, 88 insertions(+), 80 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 566ea26..572d361 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.0.9002 +Version: 0.6.0.9003 Date: 2023-06-12 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], diff --git a/R/RcppExports.R b/R/RcppExports.R index d078a09..82d81a7 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -37,22 +37,21 @@ .Call(`_RcppCWB_region_matrix_context`, corpus, registry, region_matrix, p_attribute, s_attribute, boundary, left, right) } -#' Get vector with min and max struc of s-attribute within a region +#' Get min and max strucs of s-attribute present in region +#' +#' Look up the minimum and maximum struc of a s-attribute within a region, +#' including scenario of nested s-attributes. If there are no regions of the +#' s-attribute within the region, `NA` values are returned. +#' #' -#' Look up the minimum and maximum struc of a s-attribute within a region. -#' Works for nested s-attributes. If there are no regions of the s-attribute -#' within the region, a vector with (two) `NA` values is returned. #' @param corpus ID of a CWB corpus. #' @param registry Path of the registry directory. If `NULL` (default), value #' of environment variable 'CORPUS_REGISTRY' will be used. -#' @param s_attribute Name of nested structural attribute. +#' @param s_attribute Name of structural attribute. The attribute may be +#' nested. #' @param region Vector with left and right corpus position of region. -#' @return A length-two integer vector. 
-#' @rdname regions_to_strucs -region_to_strucs <- function(corpus, s_attribute, region, registry = NULL) { - .Call(`_RcppCWB_region_to_strucs`, corpus, s_attribute, region, registry) -} - +#' @return Depending whether input is a vector (argument `region`) or a matrix +#' (argument `region_matrix`), a vector or a matrix. #' @param region_matrix A two-column `matrix` with regions, left corpus #' positions in column 1, right corpus positions in column 2. #' @rdname regions_to_strucs @@ -60,6 +59,11 @@ region_matrix_to_struc_matrix <- function(corpus, s_attribute, region_matrix, re .Call(`_RcppCWB_region_matrix_to_struc_matrix`, corpus, s_attribute, region_matrix, registry) } +#' @rdname regions_to_strucs +region_to_strucs <- function(corpus, s_attribute, region, registry = NULL) { + .Call(`_RcppCWB_region_to_strucs`, corpus, s_attribute, region, registry) +} + .cwb_version <- function() { .Call(`_RcppCWB_cwb_version`) } diff --git a/man/regions_to_strucs.Rd b/man/regions_to_strucs.Rd index d89fa70..7b01a19 100644 --- a/man/regions_to_strucs.Rd +++ b/man/regions_to_strucs.Rd @@ -1,37 +1,39 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R -\name{region_to_strucs} -\alias{region_to_strucs} +\name{region_matrix_to_struc_matrix} \alias{region_matrix_to_struc_matrix} -\title{Get vector with min and max struc of s-attribute within a region} +\alias{region_to_strucs} +\title{Get min and max strucs of s-attribute present in region} \usage{ -region_to_strucs(corpus, s_attribute, region, registry = NULL) - region_matrix_to_struc_matrix( corpus, s_attribute, region_matrix, registry = NULL ) + +region_to_strucs(corpus, s_attribute, region, registry = NULL) } \arguments{ \item{corpus}{ID of a CWB corpus.} -\item{s_attribute}{Name of nested structural attribute.} +\item{s_attribute}{Name of structural attribute. 
The attribute may be +nested.} -\item{region}{Vector with left and right corpus position of region.} +\item{region_matrix}{A two-column \code{matrix} with regions, left corpus +positions in column 1, right corpus positions in column 2.} \item{registry}{Path of the registry directory. If \code{NULL} (default), value of environment variable 'CORPUS_REGISTRY' will be used.} -\item{region_matrix}{A two-column \code{matrix} with regions, left corpus -positions in column 1, right corpus positions in column 2.} +\item{region}{Vector with left and right corpus position of region.} } \value{ -A length-two integer vector. +Depending whether input is a vector (argument \code{region}) or a matrix +(argument \code{region_matrix}), a vector or a matrix. } \description{ -Look up the minimum and maximum struc of a s-attribute within a region. -Works for nested s-attributes. If there are no regions of the s-attribute -within the region, a vector with (two) \code{NA} values is returned. +Look up the minimum and maximum struc of a s-attribute within a region, +including scenario of nested s-attributes. If there are no regions of the +s-attribute within the region, \code{NA} values are returned. 
} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 975ab42..af24686 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -136,31 +136,31 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// region_to_strucs -Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry); -RcppExport SEXP _RcppCWB_region_to_strucs(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP regionSEXP, SEXP registrySEXP) { +// region_matrix_to_struc_matrix +Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, Rcpp::IntegerMatrix region_matrix, SEXP registry); +RcppExport SEXP _RcppCWB_region_matrix_to_struc_matrix(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP region_matrixSEXP, SEXP registrySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type region(regionSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type region_matrix(region_matrixSEXP); Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); - rcpp_result_gen = Rcpp::wrap(region_to_strucs(corpus, s_attribute, region, registry)); + rcpp_result_gen = Rcpp::wrap(region_matrix_to_struc_matrix(corpus, s_attribute, region_matrix, registry)); return rcpp_result_gen; END_RCPP } -// region_matrix_to_struc_matrix -Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, Rcpp::IntegerMatrix region_matrix, SEXP registry); -RcppExport SEXP _RcppCWB_region_matrix_to_struc_matrix(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP region_matrixSEXP, SEXP registrySEXP) { +// region_to_strucs +Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry); +RcppExport SEXP _RcppCWB_region_to_strucs(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP regionSEXP, 
SEXP registrySEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type corpus(corpusSEXP); Rcpp::traits::input_parameter< SEXP >::type s_attribute(s_attributeSEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerMatrix >::type region_matrix(region_matrixSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type region(regionSEXP); Rcpp::traits::input_parameter< SEXP >::type registry(registrySEXP); - rcpp_result_gen = Rcpp::wrap(region_matrix_to_struc_matrix(corpus, s_attribute, region_matrix, registry)); + rcpp_result_gen = Rcpp::wrap(region_to_strucs(corpus, s_attribute, region, registry)); return rcpp_result_gen; END_RCPP } @@ -2688,8 +2688,8 @@ static const R_CallMethodDef CallEntries[] = { {"_RcppCWB_ids_to_count_matrix", (DL_FUNC) &_RcppCWB_ids_to_count_matrix, 1}, {"_RcppCWB_region_matrix_to_count_matrix", (DL_FUNC) &_RcppCWB_region_matrix_to_count_matrix, 4}, {"_RcppCWB_region_matrix_context", (DL_FUNC) &_RcppCWB_region_matrix_context, 8}, - {"_RcppCWB_region_to_strucs", (DL_FUNC) &_RcppCWB_region_to_strucs, 4}, {"_RcppCWB_region_matrix_to_struc_matrix", (DL_FUNC) &_RcppCWB_region_matrix_to_struc_matrix, 4}, + {"_RcppCWB_region_to_strucs", (DL_FUNC) &_RcppCWB_region_to_strucs, 4}, {"_RcppCWB_cwb_version", (DL_FUNC) &_RcppCWB_cwb_version, 0}, {"_RcppCWB_p_attr_default", (DL_FUNC) &_RcppCWB_p_attr_default, 0}, {"_RcppCWB_s_attr", (DL_FUNC) &_RcppCWB_s_attr, 3}, diff --git a/src/addons.cpp b/src/addons.cpp index d13aa0b..acf5907 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -430,55 +430,21 @@ Rcpp::IntegerMatrix region_matrix_context(SEXP corpus, SEXP registry, Rcpp::Inte return cpos_matrix; } -//' Get vector with min and max struc of s-attribute within a region +//' Get min and max strucs of s-attribute present in region +//' +//' Look up the minimum and maximum struc of a s-attribute within a region, +//' including scenario of nested s-attributes. 
If there are no regions of the +//' s-attribute within the region, `NA` values are returned. +//' //' -//' Look up the minimum and maximum struc of a s-attribute within a region. -//' Works for nested s-attributes. If there are no regions of the s-attribute -//' within the region, a vector with (two) `NA` values is returned. //' @param corpus ID of a CWB corpus. //' @param registry Path of the registry directory. If `NULL` (default), value //' of environment variable 'CORPUS_REGISTRY' will be used. -//' @param s_attribute Name of nested structural attribute. +//' @param s_attribute Name of structural attribute. The attribute may be +//' nested. //' @param region Vector with left and right corpus position of region. -//' @return A length-two integer vector. -//' @rdname regions_to_strucs -// [[Rcpp::export]] -Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry = R_NilValue){ - - if (region(0) > region(1)) - return Rcpp::IntegerVector::create(NA_INTEGER, NA_INTEGER); - - if (region.length() != 2) - return Rcpp::IntegerVector::create(NA_INTEGER, NA_INTEGER); - - if (registry == R_NilValue) registry = mkString(getenv("CORPUS_REGISTRY")); - - Attribute* att = make_s_attribute(corpus, s_attribute, registry); - Rcpp::IntegerVector strucs(2); - - bool more = true; - while (more){ - strucs(0) = cl_cpos2struc(att, region(0)); - if (strucs(0) >= 0) more = false; - if (region(0) > region(1)) more = false; - region(0)++; - }; - - more = true; - while (more){ - strucs(1) = cl_cpos2struc(att, region(1)); - if (strucs(1) >= 0) more = false; - if (region(1) < region(0)) more = false; - region(1)--; - }; - - if (region(0) < 0) region(0) = NA_INTEGER; - if (region(1) < 0) region(1) = NA_INTEGER; - - return strucs; -} - - +//' @return Depending whether input is a vector (argument `region`) or a matrix +//' (argument `region_matrix`), a vector or a matrix. 
//' @param region_matrix A two-column `matrix` with regions, left corpus //' positions in column 1, right corpus positions in column 2. //' @rdname regions_to_strucs @@ -493,6 +459,13 @@ Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, int i; for (i = 0; i < region_matrix.nrow(); i++){ + + if (region_matrix(i,0) > region_matrix(i,1)){ + struc_matrix(i,0) = NA_INTEGER; + struc_matrix(i,1) = NA_INTEGER; + continue; + } + more = true; while (more){ struc_matrix(i,0) = cl_cpos2struc(att, region_matrix(i,0)); @@ -515,3 +488,32 @@ Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, return struc_matrix; } + + +//' @rdname regions_to_strucs +// [[Rcpp::export]] +Rcpp::IntegerVector region_to_strucs(SEXP corpus, SEXP s_attribute, Rcpp::IntegerVector region, SEXP registry = R_NilValue){ + + /* The default case is you want to get a struc matrix for a region matrix, + * so to keep the times the attribute is instantiated to a minimum, + * this simple case maps arguments on the more complex case */ + + Rcpp::IntegerMatrix region_matrix(1,2); + Rcpp::IntegerMatrix struc_matrix; + Rcpp::IntegerVector strucs(2); + + region_matrix(0,0) = region(0); + region_matrix(0,1) = region(1); + + struc_matrix = region_matrix_to_struc_matrix( + corpus, + s_attribute, + region_matrix, + registry + ); + + strucs(0) = struc_matrix(0,0); + strucs(1) = struc_matrix(0,1); + + return strucs; +} From 366bdc1ebb01c8ca8fd969a46a1df507992aa31d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 13 Jun 2023 00:58:08 +0200 Subject: [PATCH 07/90] overcounting bug removed --- DESCRIPTION | 4 ++-- src/addons.cpp | 32 +++++++++++++++----------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 572d361..b39b9f7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.0.9003 
-Date: 2023-06-12 +Version: 0.6.0.9004 +Date: 2023-06-13 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/src/addons.cpp b/src/addons.cpp index acf5907..6d99ee4 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -455,35 +455,33 @@ Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, Attribute* att = make_s_attribute(corpus, s_attribute, registry); Rcpp::IntegerMatrix struc_matrix(region_matrix.nrow(), 2); - bool more; + Rcpp::IntegerMatrix regions = clone(region_matrix); int i; - for (i = 0; i < region_matrix.nrow(); i++){ + for (i = 0; i < regions.nrow(); i++){ - if (region_matrix(i,0) > region_matrix(i,1)){ + if (regions(i,0) > regions(i,1)){ struc_matrix(i,0) = NA_INTEGER; struc_matrix(i,1) = NA_INTEGER; continue; } - more = true; - while (more){ - struc_matrix(i,0) = cl_cpos2struc(att, region_matrix(i,0)); - if (struc_matrix(i,0) >= 0) more = false; - if (region_matrix(i,0) > region_matrix(i,1)) more = false; - region_matrix(i,0)++; + while (true){ + struc_matrix(i,0) = cl_cpos2struc(att, regions(i,0)); + if (struc_matrix(i,0) >= 0) break; + if (regions(i,0) >= regions(i,1)) break; + regions(i,0)++; }; - more = true; - while (more){ - struc_matrix(i,1) = cl_cpos2struc(att, region_matrix(i,1)); - if (struc_matrix(i,1) >= 0) more = false; - if (region_matrix(i,1) < region_matrix(i,0)) more = false; - region_matrix(i,1)--; + while (true){ + struc_matrix(i,1) = cl_cpos2struc(att, regions(i,1)); + if (struc_matrix(i,1) >= 0) break; + if (regions(i,1) < regions(i,0)) break; + regions(i,1)--; }; - if (region_matrix(i,0) < 0) region_matrix(i,0) = NA_INTEGER; - if (region_matrix(i,1) < 0) region_matrix(i,1) = NA_INTEGER; + if (regions(i,0) < 0) regions(i,0) = NA_INTEGER; + if (regions(i,1) < 0) regions(i,1) = NA_INTEGER; } return struc_matrix; From 8a47af0c5917a9f6b5f6b4441ebc986805982fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 13 Jun 2023 13:04:54 +0200 
Subject: [PATCH 08/90] release v0.6.1 --- DESCRIPTION | 2 +- R/cl.R | 2 ++ cran-comments.md | 21 +++++---------------- man/cl_rework.Rd | 2 ++ 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b39b9f7..7ea88d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.0.9004 +Version: 0.6.1 Date: 2023-06-13 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], diff --git a/R/cl.R b/R/cl.R index 7a146bb..e975a85 100644 --- a/R/cl.R +++ b/R/cl.R @@ -542,6 +542,7 @@ cl_list_corpora <- function(){ #' @name cl_rework #' @rdname cl_rework #' @examples +#' \donttest{ #' library(Rcpp) #' #' cppFunction( @@ -561,6 +562,7 @@ cl_list_corpora <- function(){ #' ) #' #' result <- get_str("REUTERS", "word", RcppCWB::get_tmp_registry(), 0:50) +#' } NULL diff --git a/cran-comments.md b/cran-comments.md index 236a80e..57d41cf 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,21 +1,10 @@ ## General remarks -- This release replaces dependency PCRE with PCRE2 in the CWB source code -included in this package. I hereby anticipate that build machines will -successively not have PCRE installed. Fedora will presumably be first. +- This release extends auxiliary Rcpp functionality and fixes minor bugs. No +changes of fundamentals. -- Rcpp wrappers for CWB functionality is now exported such that it is possible -to write Rcpp inline C++ functions using this functionality. - -- This is the first package version that includes a vignette (explaining how to -write inline C++ functions). - -- The cleanup script is extended and reverts modifications that had previously -gone unnoticed. - -- A failure to build Windows binaries for R-oldrel is addressed: Repository -https://github.com/PolMine/libcl that is used to get static libraries has been -updated. 
+- An example using `cppFunction()` ran 10 secs on winbuilder, so I wrapped this +into \donttest{}. Previous aspects I repeat: @@ -33,7 +22,7 @@ change. * CI checks with GitHub Actions (Windows/macOS/Ubuntu) * R winbuilder (R 4.2 release, devel, oldrel) -* local macOS, R 4.2.2 (arm64) +* local macOS, R 4.3.0 (arm64) ## R CMD check results diff --git a/man/cl_rework.Rd b/man/cl_rework.Rd index 6a1f8af..6b6063c 100644 --- a/man/cl_rework.Rd +++ b/man/cl_rework.Rd @@ -90,6 +90,7 @@ functions with a C++ implementation that are compiled and linked using \code{Rcpp::cppFunction()} or \code{Rcpp::sourceCpp()} } \examples{ +\donttest{ library(Rcpp) cppFunction( @@ -110,3 +111,4 @@ cppFunction( result <- get_str("REUTERS", "word", RcppCWB::get_tmp_registry(), 0:50) } +} From 983a0366a7043f44f672013c79f6946001d1fb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 13 Jun 2023 13:05:46 +0200 Subject: [PATCH 09/90] release v0.6.1 --- cran-comments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cran-comments.md b/cran-comments.md index 57d41cf..b2b7efa 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -21,7 +21,7 @@ change. 
## Test environments * CI checks with GitHub Actions (Windows/macOS/Ubuntu) -* R winbuilder (R 4.2 release, devel, oldrel) +* R winbuilder (R 4.3.0 release, devel, oldrel) * local macOS, R 4.3.0 (arm64) From de5cdcc027fbefd5675301878d54b7f063ae4a02 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Thu, 15 Jun 2023 01:40:39 +0800 Subject: [PATCH 10/90] Fix building for PowerPC --- configure | 11 ++++-- patch/PatchCWB.R | 2 +- src/cwb/INSTALL-MACOS | 2 +- src/cwb/config.mk | 12 +++--- src/cwb/config/platform/darwin-ppc | 37 +++++++++++++++++++ src/cwb/config/platform/darwin-ppc64 | 37 +++++++++++++++++++ .../platform/{darwin-64 => darwin-x86_64} | 0 src/cwb/install-scripts/config-basic | 7 ++-- 8 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 src/cwb/config/platform/darwin-ppc create mode 100644 src/cwb/config/platform/darwin-ppc64 rename src/cwb/config/platform/{darwin-64 => darwin-x86_64} (100%) diff --git a/configure b/configure index 32deb3f..ffb7aa1 100755 --- a/configure +++ b/configure @@ -25,10 +25,15 @@ case $OS in echo "* compiling for darwin-arm64" CWB_PLATFORM_CONFIG_FILE="darwin-arm64" CWB_PLATFORM_CONFIG_FILE_CC="unix";; - + # uname -m is a poor choice, as it shows machine arch family and has nothing to do with desired arch + # of the build; for PowerPC it reports 'Power Macintosh', which is not an arch at all. 
+ 'Power Macintosh') + echo "* compiling for darwin-ppc; if you wish to build for ppc64, please change the value manually" + CWB_PLATFORM_CONFIG_FILE="darwin-ppc" + CWB_PLATFORM_CONFIG_FILE_CC="unix";; *) - echo "* compiling for darwin-64" - CWB_PLATFORM_CONFIG_FILE="darwin-64" + echo "* compiling for darwin-x86_64" + CWB_PLATFORM_CONFIG_FILE="darwin-x86_64" CWB_PLATFORM_CONFIG_FILE_CC="unix";; esac ;; diff --git a/patch/PatchCWB.R b/patch/PatchCWB.R index 71db770..338a1db 100644 --- a/patch/PatchCWB.R +++ b/patch/PatchCWB.R @@ -1986,7 +1986,7 @@ PatchCWB <- R6Class( ), - "src/cwb/config/platform/darwin-64" = list( + "src/cwb/config/platform/darwin-x86_64" = list( # stable r1069-r1690 replace = list("^(CFLAGS\\s*=.*?)\\s+-march=native\\s+(.*?)$", "\\1 \\2", 1L) ), diff --git a/src/cwb/INSTALL-MACOS b/src/cwb/INSTALL-MACOS index 80aff97..7ee56d7 100644 --- a/src/cwb/INSTALL-MACOS +++ b/src/cwb/INSTALL-MACOS @@ -128,7 +128,7 @@ If you've used the MacPorts package manager, specify Otherwise, make sure that the prerequisite libraries have been installed into the /usr/local tree and then specify either - PLATFORM=darwin-64 + PLATFORM=darwin-x86_64 for an Intel 64-bit build or diff --git a/src/cwb/config.mk b/src/cwb/config.mk index 2b5e164..f7f65fe 100644 --- a/src/cwb/config.mk +++ b/src/cwb/config.mk @@ -30,13 +30,15 @@ # linux-64 - configuration for 64-bit CPUs # linux-opteron - with optimimzation for AMD Opteron processor # darwin MacOS / Darwin [use one of the more specific entries below] -# darwin-brew - Intel 64-bit, natively tuned, prerequisites installed with HomeBrew (recommended) +# darwin-brew - Intel 64-bit, natively tuned, prerequisites installed with HomeBrew (recommended) # darwin-brew-m1 - same for ARM 64-bit (M1 and other Apple Silicon) -# darwin-brew-release - Intel 64-bit (Core2 and newer), statically linked for binary release +# darwin-brew-release - Intel 64-bit (Core2 and newer), statically linked for binary release # darwin-brew-release-m1 - same 
for ARM 64-bit (M1 and other Apple Silicon) -# darwin-64 - Intel 64-bit, natively tunes, prerequisites installed by user -# darwin-universal - universal 64-bit/32-bit build on Mac OS X 10.6 - 10.13 (deprecated) -# darwin-port - generic build, prerequisites installed with MacPorts (deprecated) +# darwin-x86_64 - Intel 64-bit, natively tunes, prerequisites installed by user +# darwin-ppc - PowerPC 32-bit, natively tunes, prerequisites installed by user +# darwin-ppc64 - PowerPC 64-bit, natively tunes, prerequisites installed by user +# darwin-universal - universal 64-bit/32-bit build on Mac OS X 10.7–10.13 (deprecated) +# darwin-port - generic build, prerequisites installed with MacPorts (deprecated) # solaris SUN Solaris 8 for SPARC CPU (unsupported) # cygwin Win32 build using Cygwin emulation layer (experimental) # mingw-cross Cross-compile for Win32-on-i586 from a *nix system with MinGW installed (experimental) diff --git a/src/cwb/config/platform/darwin-ppc b/src/cwb/config/platform/darwin-ppc new file mode 100644 index 0000000..55c1245 --- /dev/null +++ b/src/cwb/config/platform/darwin-ppc @@ -0,0 +1,37 @@ +## -*-Makefile-*- +## +## IMS Open Corpus Workbench (CWB) +## Copyright (C) 1993-2006 by IMS, University of Stuttgart +## Copyright (C) 2007- by the respective contributers (see file AUTHORS) +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by the +## Free Software Foundation; either version 2, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +## Public License for more details (in the file "COPYING", or available via +## WWW at http://www.gnu.org/copyleft/gpl.html). 
+ +## +## PLATFORM CONFIGURATION FILE: +## Recent Mac OS X with Xcode 5 or later on 64-bit Intel Core2 and newer CPUs, natively tuned, +## with prerequisite libraries installed by user or package manager in standard locations +## + +## Inherits from basic Darwin configuration +include $(TOP)/config/platform/darwin + +## Use GCC compiler +CC ?= /usr/bin/gcc + +## Only build PowerPC 32-bit architecture, optimised for this system +## Notice, using -mtune=native assumes a newer GCC +CFLAGS = -Wall -O3 -arch ppc -mtune=native + +DEPEND_CFLAGS = -Wall -O3 + +## CPU architecture and operating system used to name binary releases +RELEASE_ARCH = ppc diff --git a/src/cwb/config/platform/darwin-ppc64 b/src/cwb/config/platform/darwin-ppc64 new file mode 100644 index 0000000..54b102c --- /dev/null +++ b/src/cwb/config/platform/darwin-ppc64 @@ -0,0 +1,37 @@ +## -*-Makefile-*- +## +## IMS Open Corpus Workbench (CWB) +## Copyright (C) 1993-2006 by IMS, University of Stuttgart +## Copyright (C) 2007- by the respective contributers (see file AUTHORS) +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by the +## Free Software Foundation; either version 2, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +## Public License for more details (in the file "COPYING", or available via +## WWW at http://www.gnu.org/copyleft/gpl.html). 
+ +## +## PLATFORM CONFIGURATION FILE: +## Recent Mac OS X with Xcode 5 or later on 64-bit Intel Core2 and newer CPUs, natively tuned, +## with prerequisite libraries installed by user or package manager in standard locations +## + +## Inherits from basic Darwin configuration +include $(TOP)/config/platform/darwin + +## Use GCC compiler +CC ?= /usr/bin/gcc + +## Only build PowerPC 64-bit architecture, optimised for this system +## Notice, using -mtune=native assumes a newer GCC +CFLAGS = -Wall -O3 -arch ppc64 -mtune=native + +DEPEND_CFLAGS = -Wall -O3 + +## CPU architecture and operating system used to name binary releases +RELEASE_ARCH = ppc64 diff --git a/src/cwb/config/platform/darwin-64 b/src/cwb/config/platform/darwin-x86_64 similarity index 100% rename from src/cwb/config/platform/darwin-64 rename to src/cwb/config/platform/darwin-x86_64 diff --git a/src/cwb/install-scripts/config-basic b/src/cwb/install-scripts/config-basic index ed5bf84..e8f8b28 100755 --- a/src/cwb/install-scripts/config-basic +++ b/src/cwb/install-scripts/config-basic @@ -35,8 +35,9 @@ case "$uname_output" in ;; Darwin*) case "$uname_output" in - *ppc) - echo "*** MacOS is no longer supported on PowerPC processors ***" + # `uname -a` will just show 'Power Macintosh' among other info. 
+ *Power*) + echo "*** Please use darwin-ppc or darwin-ppc64, depending on desired arch ***" exit ;; *i386) @@ -52,7 +53,7 @@ case "$uname_output" in then platform="darwin-port-core2" # GLib seems to be provided by MacPorts else - platform="darwin-64" # assume user has installed the prerequisite manually + platform="darwin-x86_64" # assume user has installed the prerequisite manually fi ;; *) From 2bd5fe2677f213e9a5261a972c0f9d87e2e4512c Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Thu, 15 Jun 2023 04:48:10 +0800 Subject: [PATCH 11/90] darwin-arm64: bring to sanity --- src/cwb/config/platform/darwin-arm64 | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/cwb/config/platform/darwin-arm64 b/src/cwb/config/platform/darwin-arm64 index 532a09a..8a88a9b 100644 --- a/src/cwb/config/platform/darwin-arm64 +++ b/src/cwb/config/platform/darwin-arm64 @@ -17,8 +17,7 @@ ## ## PLATFORM CONFIGURATION FILE: -## Recent Mac OS X with Xcode 5 or later on 64-bit Intel Core2 and newer CPUs, natively tuned, -## with prerequisite libraries installed by user or package manager in standard locations +## Recent Mac OS X with on Apple M CPUs. ## ## Inherits from basic Darwin configuration @@ -33,6 +32,4 @@ CFLAGS = -Wall -O3 -arch arm64 -mtune=native DEPEND_CFLAGS = -Wall -O3 ## CPU architecture and operating system used to name binary releases -RELEASE_ARCH = x86_64 -RELEASE_OS = osx-10.7 - +RELEASE_ARCH = arm64 From 215acedb10bf496ce1ad551a2ebaaffe3eed0491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 15 Jun 2023 13:08:22 +0200 Subject: [PATCH 12/90] version updated in NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 587ca67..600192d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# RcppCWB 0.6.0.9001ff +# RcppCWB 0.6.1 * New function `region_to_strucs()` to get minimumum and maximum struc of s-attribute within region provided. Works also for nested s-attributes. 
From 1f6757d75ee88d3ff557551327d669b254d5965b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 15 Jun 2023 15:07:32 +0200 Subject: [PATCH 13/90] configure add flag to avoid clang undefined symbol error --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ configure | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7ea88d6..3744a06 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.1 +Version: 0.6.1.9001 Date: 2023-06-13 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], diff --git a/NEWS.md b/NEWS.md index 600192d..e95791e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# RcppCWB 0.6.1.9001 + +* The configure script adds flag '-fno-objc-msgsend-selector-stubs' to linker +call if clang compiler is used to avoid 'Symbol not found'-error on CRAN. + # RcppCWB 0.6.1 * New function `region_to_strucs()` to get minimumum and maximum struc of diff --git a/configure b/configure index 32deb3f..1ee660f 100755 --- a/configure +++ b/configure @@ -13,6 +13,13 @@ CC_R=`$R_HOME/bin/R CMD config CC` CARBON="" SOCKETLIB="" +# see https://github.com/PolMine/RcppCWB/issues/82 +if [[ $CC_R = clang* ]]; then + NOSTUBS="-fno-objc-msgsend-selector-stubs" +else + NOSTUBS="" +fi + OS=`uname -s` echo "* operating system detected for CWB configuration: $OS" @@ -197,7 +204,7 @@ CWB_DIR="`pwd`/src/cwb" if [ -f ./src/Makevars ]; then rm ./src/Makevars; fi printf "PKG_CPPFLAGS=-I%s/src/cwb/cqp -I%s/src/cwb/cl -I%s/src/cwb/CQi %s\n" ${BUILD_DIR} ${BUILD_DIR} ${BUILD_DIR} "$PCRE2_CFLAGS" > ./src/Makevars -printf "PKG_LIBS=-L%s/cl -L%s/cqp -L%s/utils -lcwb -lcqp -lcl %s %s %s %s\n" ${CWB_DIR} ${CWB_DIR} ${CWB_DIR} "$GLIB_LINKER_FLAGS" "$PCRE2_LIBDIRS" "$SOCKETLIB" "$CARBON" >> ./src/Makevars +printf "PKG_LIBS=-L%s/cl -L%s/cqp -L%s/utils -lcwb -lcqp -lcl %s %s %s %s %s\n" ${CWB_DIR} ${CWB_DIR} ${CWB_DIR} 
"$GLIB_LINKER_FLAGS" "$PCRE2_LIBDIRS" "$SOCKETLIB" "$CARBON" "$NOSTUBS">> ./src/Makevars printf "\${SHLIB}: libcl.a libcqp.a libcwb.a\n" >>./src/Makevars printf "libcl.a: depend\n" >> ./src/Makevars printf "\tcd cwb; R_PACKAGE_SOURCE=%s PKG_CONFIG_PATH=%s \${MAKE} cl\n" ${CWB_DIR} ${PKG_CONFIG_PATH} >> ./src/Makevars From 7c65efee43557db5250d37e9ac8f5dd4f73633d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 15 Jun 2023 21:31:20 +0200 Subject: [PATCH 14/90] avoid symbol not found error without bashism #82 --- configure | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 1ee660f..d4f8179 100755 --- a/configure +++ b/configure @@ -14,11 +14,10 @@ CARBON="" SOCKETLIB="" # see https://github.com/PolMine/RcppCWB/issues/82 -if [[ $CC_R = clang* ]]; then - NOSTUBS="-fno-objc-msgsend-selector-stubs" -else - NOSTUBS="" -fi +case $CC_R in + clang*) NOSTUBS="-fno-objc-msgsend-selector-stubs";; + *) NOSTUBS="" +esac OS=`uname -s` echo "* operating system detected for CWB configuration: $OS" From c6df1cc42f545732404de4740d0e0bbef1c41cb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 16 Jun 2023 14:11:21 +0200 Subject: [PATCH 15/90] arg "s" added to ar call in Makefiles --- src/cwb/cl/Makefile | 2 +- src/cwb/cqp/Makefile | 2 +- src/cwb/utils/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cl/Makefile b/src/cwb/cl/Makefile index 75b1c17..69a8fd4 100644 --- a/src/cwb/cl/Makefile +++ b/src/cwb/cl/Makefile @@ -105,7 +105,7 @@ all: libcl.a $(EXTRA_OBJS) libcl.a: $(OBJS) $(RM) $@ - $(AR) cq $@ $^ + $(AR) cqs $@ $^ install: libcl.a ifndef __MINGW__ diff --git a/src/cwb/cqp/Makefile b/src/cwb/cqp/Makefile index 436a6a5..eea3efb 100644 --- a/src/cwb/cqp/Makefile +++ b/src/cwb/cqp/Makefile @@ -102,7 +102,7 @@ all: libcqp.a parser.tab.c parser.tab.h: parser.y libcqp.a: $(OBJS) $(CQI_OBJS) $(RM) $@ - $(AR) cq $@ $^ + $(AR) cqs $@ $^ 
cqp$(EXEC_SUFFIX): $(OBJS) $(CQP_OBJS) llquery.o $(LIBCL_PATH) @$(ECHO) " .... link executable" $@ diff --git a/src/cwb/utils/Makefile b/src/cwb/utils/Makefile index c6abc4f..8db0c1e 100644 --- a/src/cwb/utils/Makefile +++ b/src/cwb/utils/Makefile @@ -48,7 +48,7 @@ all: libcwb.a libcwb.a: $(OBJS) @$(ECHO) "--------------------------------- CREATING ARCHIVE" $(RM) $@ - $(AR) cq $@ $^ + $(AR) cqs $@ $^ cwb-encode.o: cwb-encode.c ${CC} -c $(CFLAGS_ALL) -o cwb-encode.o cwb-encode.c From 2074af6d108973d4d7165d094f170bd1552ab3fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 16 Jun 2023 14:52:26 +0200 Subject: [PATCH 16/90] =?UTF-8?q?v0.6.2=20first=20release=C3=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DESCRIPTION | 4 ++-- NEWS.md | 6 +++++- cran-comments.md | 18 ++++++++++++++---- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3744a06..2e1183a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.1.9001 -Date: 2023-06-13 +Version: 0.6.2 +Date: 2023-06-16 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index e95791e..7e20884 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,11 @@ -# RcppCWB 0.6.1.9001 +# RcppCWB 0.6.1.9001-9002 * The configure script adds flag '-fno-objc-msgsend-selector-stubs' to linker call if clang compiler is used to avoid 'Symbol not found'-error on CRAN. +* The configure script now covers the case of Power PCs. Files for the power pc +scenario have been added to src/cwb/config/platform; darwin-64 has been renamed +to darwin-x86_64 as a matter of consistency #79. 
+ # RcppCWB 0.6.1 diff --git a/cran-comments.md b/cran-comments.md index b2b7efa..fe95c81 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,10 +1,20 @@ ## General remarks -- This release extends auxiliary Rcpp functionality and fixes minor bugs. No -changes of fundamentals. +This is a quick follow up to v0.6.1. Check results report ERRORs on macOS build +machines. At the linker stage, you see: +"Symbol not found: _objc_msgSend$UTF8String" +for r-release-macos-arm64 and r-oldrel-macos-arm64. + +The errors most likely result from a scenario when clang and Xcode are not +aligned, see: https://github.com/xamarin/xamarin-macios/issues/16223 + +My (temporary) solution is to add flag -fno-objc-msgsend-selector-stubs to clang +to PKG_LIBS. I hope that macOS errors on CRAN build machines will not occurr +any more. + +A further modification of the configure script is that it now deals with the +scenario of Power PCs. -- An example using `cppFunction()` ran 10 secs on winbuilder, so I wrapped this -into \donttest{}. Previous aspects I repeat: From 1c183f4a20df06690c1cce27327a9ecd56a3b1a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 16 Jun 2023 14:54:25 +0200 Subject: [PATCH 17/90] version adjusted in NEWS file --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7e20884..63b27cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# RcppCWB 0.6.1.9001-9002 +# RcppCWB 0.6.2 * The configure script adds flag '-fno-objc-msgsend-selector-stubs' to linker call if clang compiler is used to avoid 'Symbol not found'-error on CRAN. 
From c5a011790a7e4014d7541aeb4b01915a19ab051a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 23 Jun 2023 11:36:35 +0200 Subject: [PATCH 18/90] drop flag -fno-objc-msgsend-selector-stubs again --- NEWS.md | 6 ++++-- configure | 8 +------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 63b27cf..0613b68 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,12 @@ # RcppCWB 0.6.2 -* The configure script adds flag '-fno-objc-msgsend-selector-stubs' to linker -call if clang compiler is used to avoid 'Symbol not found'-error on CRAN. * The configure script now covers the case of Power PCs. Files for the power pc scenario have been added to src/cwb/config/platform; darwin-64 has been renamed to darwin-x86_64 as a matter of consistency #79. +* Warning "variable 'nr_targets' set but not used" for files newly reported by +Apple clang version 14.0.3 (clang-1403.0.22.14.1) is addressed by commenting out +respective lines #83. + # RcppCWB 0.6.1 diff --git a/configure b/configure index 5649375..da6d8bb 100755 --- a/configure +++ b/configure @@ -13,12 +13,6 @@ CC_R=`$R_HOME/bin/R CMD config CC` CARBON="" SOCKETLIB="" -# see https://github.com/PolMine/RcppCWB/issues/82 -case $CC_R in - clang*) NOSTUBS="-fno-objc-msgsend-selector-stubs";; - *) NOSTUBS="" -esac - OS=`uname -s` echo "* operating system detected for CWB configuration: $OS" @@ -208,7 +202,7 @@ CWB_DIR="`pwd`/src/cwb" if [ -f ./src/Makevars ]; then rm ./src/Makevars; fi printf "PKG_CPPFLAGS=-I%s/src/cwb/cqp -I%s/src/cwb/cl -I%s/src/cwb/CQi %s\n" ${BUILD_DIR} ${BUILD_DIR} ${BUILD_DIR} "$PCRE2_CFLAGS" > ./src/Makevars -printf "PKG_LIBS=-L%s/cl -L%s/cqp -L%s/utils -lcwb -lcqp -lcl %s %s %s %s %s\n" ${CWB_DIR} ${CWB_DIR} ${CWB_DIR} "$GLIB_LINKER_FLAGS" "$PCRE2_LIBDIRS" "$SOCKETLIB" "$CARBON" "$NOSTUBS">> ./src/Makevars +printf "PKG_LIBS=-L%s/cl -L%s/cqp -L%s/utils -lcwb -lcqp -lcl %s %s %s %s %s\n" ${CWB_DIR} ${CWB_DIR} ${CWB_DIR} "$GLIB_LINKER_FLAGS" 
"$PCRE2_LIBDIRS" "$SOCKETLIB" "$CARBON" >> ./src/Makevars printf "\${SHLIB}: libcl.a libcqp.a libcwb.a\n" >>./src/Makevars printf "libcl.a: depend\n" >> ./src/Makevars printf "\tcd cwb; R_PACKAGE_SOURCE=%s PKG_CONFIG_PATH=%s \${MAKE} cl\n" ${CWB_DIR} ${PKG_CONFIG_PATH} >> ./src/Makevars From 99a429f98198dcea1706a198b0fecd93a31b1ede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 28 Jun 2023 22:42:45 +0200 Subject: [PATCH 19/90] avoid misleading indentation warning #85 tilde expansion for cwb_* funs #84 --- R/cwb.R | 56 ++++++++++++++++++++++++++++++++++---------- man/cwb_utils.Rd | 14 +++++------ src/cwb/cqp/lex.yy.c | 11 +++++---- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index 75e684d..37ddda7 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -1,14 +1,14 @@ #' CWB Tools for Creating Corpora #' -#' Wrappers for the CWB tools (\code{cwb-makeall}, \code{cwb-huffcode}, -#' \code{cwb-compress-rdx}). Unlike the 'original' command line tools, these -#' wrappers will always perform a specific indexing/compression step on one -#' positional attribute, and produce all components. +#' Wrappers for the CWB tools (`cwb-makeall`, `cwb-huffcode`, +#' `cwb-compress-rdx`). Unlike the 'original' command line tools, these wrappers +#' will always perform a specific indexing/compression step on one positional +#' attribute, and produce all components. #' -#' @param corpus name of a CWB corpus (upper case) -#' @param p_attribute name p-attribute -#' @param registry path to the registry directory, defaults to the value of the -#' environment variable CORPUS_REGISTRY +#' @param corpus Name of a CWB corpus (upper case). +#' @param p_attribute Name of p-attribute. +#' @param registry Path to the registry directory, defaults to the value of the +#' environment variable CORPUS_REGISTRY. #' @param quietly A `logical` value, whether to turn off messages (including #' warnings). 
#' @param verbose A `logical` value, whether to show progress information @@ -67,21 +67,32 @@ #' @rdname cwb_utils #' @export cwb_makeall #' @importFrom utils capture.output +#' @importFrom fs path path_expand cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE){ + + registry <- path_expand(path(registry)) check_registry(registry) - regfile <- file.path(normalizePath(registry, winslash = "/"), tolower(corpus), fsep = "/") + regfile <- path(registry, tolower(corpus)) if (!file.exists(regfile)){ - stop(sprintf("No registry file for corpus '%s' in registry directory '%s'.", corpus, registry)) + stop( + sprintf( + "No registry file for corpus '%s' in registry directory '%s'.", + corpus, registry + ) + ) } - # The registry directory provided is ignored if the corpus has already been loaded, resulting - # in unexpected behavior. Therefore, we unload the corpus and force reloading corpora. + # The registry directory provided is ignored if the corpus has already been + # loaded, resulting in unexpected behavior. Therefore, we unload the corpus + # and force reloading corpora. if (toupper(corpus) %in% cqp_list_corpora()){ cl_delete_corpus(corpus, registry = registry) cqp_reset_registry(registry = registry) } - makeall <- function() .cwb_makeall(x = corpus, p_attribute = p_attribute, registry_dir = registry) + makeall <- function() + .cwb_makeall(x = corpus, p_attribute = p_attribute, registry_dir = registry) + if (quietly){ capture.output({success <- makeall()}, type = "output") } else { @@ -96,6 +107,17 @@ cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGIS #' @param delete A `logical` value, whether to remove redundant files after #' compression. 
cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){ + + registry <- path_expand(path(registry)) + check_registry(registry) + regfile <- path(registry, tolower(corpus)) + if (!file.exists(regfile)){ + stop(sprintf( + "No registry file for corpus '%s' in registry directory '%s'.", + corpus, registry + )) + } + huffcode <- function() .cwb_huffcode(x = corpus, p_attribute = p_attribute, registry_dir = registry) @@ -126,6 +148,14 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI #' registry = get_tmp_registry() #' ) cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){ + + registry <- path_expand(path(registry)) + check_registry(registry) + regfile <- path(registry, tolower(corpus)) + if (!file.exists(regfile)){ + stop(sprintf("No registry file for corpus '%s' in registry directory '%s'.", corpus, registry)) + } + compress_rdx <-function() .cwb_compress_rdx(x = corpus, p_attribute = p_attribute, registry_dir = registry) diff --git a/man/cwb_utils.Rd b/man/cwb_utils.Rd index 0b1b0fd..a865571 100644 --- a/man/cwb_utils.Rd +++ b/man/cwb_utils.Rd @@ -46,12 +46,12 @@ cwb_encode( ) } \arguments{ -\item{corpus}{name of a CWB corpus (upper case)} +\item{corpus}{Name of a CWB corpus (upper case).} -\item{p_attribute}{name p-attribute} +\item{p_attribute}{Name of p-attribute.} -\item{registry}{path to the registry directory, defaults to the value of the -environment variable CORPUS_REGISTRY} +\item{registry}{Path to the registry directory, defaults to the value of the +environment variable CORPUS_REGISTRY.} \item{quietly}{A \code{logical} value, whether to turn off messages (including warnings).} @@ -93,9 +93,9 @@ tokens} } \description{ Wrappers for the CWB tools (\code{cwb-makeall}, \code{cwb-huffcode}, -\code{cwb-compress-rdx}). 
Unlike the 'original' command line tools, these -wrappers will always perform a specific indexing/compression step on one -positional attribute, and produce all components. +\code{cwb-compress-rdx}). Unlike the 'original' command line tools, these wrappers +will always perform a specific indexing/compression step on one positional +attribute, and produce all components. } \examples{ # The package includes and 'unfinished' corpus of debates in the UN General diff --git a/src/cwb/cqp/lex.yy.c b/src/cwb/cqp/lex.yy.c index 1317339..94843ac 100644 --- a/src/cwb/cqp/lex.yy.c +++ b/src/cwb/cqp/lex.yy.c @@ -1394,10 +1394,11 @@ YY_DECL #endif /* Create the reject buffer large enough to save one state per allowed character. */ - if ( ! (yy_state_buf) ) - (yy_state_buf) = (yy_state_type *)yyalloc(YY_STATE_BUF_SIZE ); - if ( ! (yy_state_buf) ) - YY_FATAL_ERROR( "out of dynamic memory in yylex()" ); + if ( ! (yy_state_buf) ) + (yy_state_buf) = (yy_state_type *)yyalloc(YY_STATE_BUF_SIZE ); + + if ( ! (yy_state_buf) ) + YY_FATAL_ERROR( "out of dynamic memory in yylex()" ); if ( ! (yy_start) ) (yy_start) = 1; /* first start state */ @@ -2454,7 +2455,7 @@ static int yy_get_next_buffer (void) if ( ! yy_is_jam ) *(yy_state_ptr)++ = yy_current_state; - return yy_is_jam ? 0 : yy_current_state; + return yy_is_jam ? 
0 : yy_current_state; } #ifndef YY_NO_UNPUT From 4bd1cb7f69402791de0ca46cfa044242b71d94a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 28 Jun 2023 23:16:53 +0200 Subject: [PATCH 20/90] no unused variable warning #83 --- src/cwb/cqp/html-print.c | 2 +- src/cwb/cqp/latex-print.c | 2 +- src/cwb/cqp/sgml-print.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cqp/html-print.c b/src/cwb/cqp/html-print.c index 691e47c..09f8c6c 100644 --- a/src/cwb/cqp/html-print.c +++ b/src/cwb/cqp/html-print.c @@ -490,7 +490,7 @@ html_print_group(Group *group, FILE *dest) html_puts(dest, target_s, SUBST_ALL); Rprintf("%d\n", count); - nr_targets++; + nr_targets = nr_targets + 1; /* replaces nr_targets++; #83 */ } Rprintf("\n\n"); diff --git a/src/cwb/cqp/latex-print.c b/src/cwb/cqp/latex-print.c index 85e6645..d92ba39 100644 --- a/src/cwb/cqp/latex-print.c +++ b/src/cwb/cqp/latex-print.c @@ -457,7 +457,7 @@ latex_print_group(Group *group, FILE *dest) Rprintf(" & %s & %d \\\\\n", latex_convert_string(target_s), count); - nr_targets++; + nr_targets = nr_targets + 1; /* replaces nr_targets++; #83 */ } Rprintf("\\end{tabular}\n"); diff --git a/src/cwb/cqp/sgml-print.c b/src/cwb/cqp/sgml-print.c index 481c4c2..f4bd5e1 100644 --- a/src/cwb/cqp/sgml-print.c +++ b/src/cwb/cqp/sgml-print.c @@ -425,7 +425,7 @@ sgml_print_group(Group *group, FILE *dest) Rprintf("%d\n", count); - nr_targets++; + nr_targets = nr_targets + 1; /* replaces nr_targets++; #83 */ } Rprintf("\n"); From 6b6a10ef30c511be593ac94f695ca074b74d3db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 28 Jun 2023 23:29:37 +0200 Subject: [PATCH 21/90] tilde expansion of registry in cwb_encode() #84 --- NEWS.md | 7 +++++-- R/cwb.R | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0613b68..e8e9627 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,8 +4,11 @@ scenario have been added to src/cwb/config/platform; 
darwin-64 has been renamed to darwin-x86_64 as a matter of consistency #79. * Warning "variable 'nr_targets' set but not used" for files newly reported by -Apple clang version 14.0.3 (clang-1403.0.22.14.1) is addressed by commenting out -respective lines #83. +Apple clang version 14.0.3 (clang-1403.0.22.14.1) is addressed #83. +* Misleading indentation warning issued by clang-15 addressed #85. +* `cwb_encode()`, `cwb_makeall()`, `cwb_huffcode()` and `cwb_compress_rdx()` +perform tilde expansion on filename provided by argument `registry`, avoiding +a crash #84. diff --git a/R/cwb.R b/R/cwb.R index 37ddda7..d32422b 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -245,8 +245,9 @@ cwb_encode <- function( ) ) - data_dir <- path.expand(data_dir) - vrt_dir <- path.expand(vrt_dir) + registry <- fs::path_expand(vrt_dir) + data_dir <- fs::path_expand(data_dir) + vrt_dir <- fs::path_expand(vrt_dir) stopifnot( is.character(corpus), length(corpus) == 1L, From 4f6153e66441677fc7e39d49dab0c451da08b8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Jun 2023 00:04:41 +0200 Subject: [PATCH 22/90] rm unintended side effect of tilde expansion #84 --- R/cwb.R | 10 +++------- cran-comments.md | 25 +++++++++++-------------- tests/testthat/test_cwb_encode.R | 4 +++- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index d32422b..4f644ca 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -245,10 +245,6 @@ cwb_encode <- function( ) ) - registry <- fs::path_expand(vrt_dir) - data_dir <- fs::path_expand(data_dir) - vrt_dir <- fs::path_expand(vrt_dir) - stopifnot( is.character(corpus), length(corpus) == 1L, is.character(registry), length(registry) == 1L, dir.exists(registry), @@ -278,9 +274,9 @@ cwb_encode <- function( ) # Ensure that paths are standardized - regfile <- as.character(fs::path(file.path(registry, tolower(corpus)))) - data_dir <- as.character(fs::path(data_dir)) - vrt_dir <- as.character(fs::path(vrt_dir)) + regfile <- 
as.character(fs::path(fs::path_expand(registry), tolower(corpus))) + data_dir <- as.character(fs::path(fs::path_expand(data_dir))) + vrt_dir <- as.character(fs::path(fs::path_expand(vrt_dir))) .cwb_encode( regfile = regfile, diff --git a/cran-comments.md b/cran-comments.md index fe95c81..5af0e5f 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,20 +1,16 @@ ## General remarks -This is a quick follow up to v0.6.1. Check results report ERRORs on macOS build -machines. At the linker stage, you see: -"Symbol not found: _objc_msgSend$UTF8String" -for r-release-macos-arm64 and r-oldrel-macos-arm64. +This is a quick follow up to v0.6.1. I address robustness issues and warnings +that are newly reported by clang (starting with clang 14). -The errors most likely result from a scenario when clang and Xcode are not -aligned, see: https://github.com/xamarin/xamarin-macios/issues/16223 - -My (temporary) solution is to add flag -fno-objc-msgsend-selector-stubs to clang -to PKG_LIBS. I hope that macOS errors on CRAN build machines will not occurr -any more. - -A further modification of the configure script is that it now deals with the -scenario of Power PCs. +Considerung check results, there are ERRORs on macOS build machines: At the +linker stage, you see: "Symbol not found: _objc_msgSend$UTF8String" +(r-release-macos-arm64 and r-oldrel-macos-arm64). +I sent a message to the R-package-devel list and Simon Urbanek explained that +an update of macOS would be required to solve this issue. If this update has +been implemented, I hope that this release will trigger new macOS tests that +succeed. Previous aspects I repeat: @@ -32,7 +28,8 @@ change. 
* CI checks with GitHub Actions (Windows/macOS/Ubuntu) * R winbuilder (R 4.3.0 release, devel, oldrel) -* local macOS, R 4.3.0 (arm64) +* local macOS, R 4.3.1 (arm64) +* Debian with R-devel and clang (14.0.6 and 15.0.6) ## R CMD check results diff --git a/tests/testthat/test_cwb_encode.R b/tests/testthat/test_cwb_encode.R index 56293c1..3fb4d0b 100644 --- a/tests/testthat/test_cwb_encode.R +++ b/tests/testthat/test_cwb_encode.R @@ -49,10 +49,12 @@ test_that( tmp_data_dir <- file.path(tempdir(), "bt") dir.create(tmp_data_dir) + + regdir <- get_tmp_registry() cwb_encode( corpus = "BT", - registry = get_tmp_registry(), + registry = regdir, vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"), data_dir = tmp_data_dir, encoding = "utf8", From 9dd10c5b2d145249d0db63ff86b1b125b38a512a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Jun 2023 09:34:43 +0200 Subject: [PATCH 23/90] release v0.6.2 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2e1183a..9518db7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') Version: 0.6.2 -Date: 2023-06-16 +Date: 2023-06-29 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], From a049ea751ff47f536962f78856091d53bd8a0e11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 1 Jul 2023 21:43:14 +0200 Subject: [PATCH 24/90] DOI updated in README --- README.Rmd | 2 +- README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.Rmd b/README.Rmd index 6d98485..156c033 100644 --- a/README.Rmd +++ b/README.Rmd @@ -2,7 +2,7 @@ output: github_document --- -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7040475.svg)](https://doi.org/10.5281/zenodo.7040475) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8104632.svg)](https://doi.org/10.5281/zenodo.8104632) 
[![License: GPL v3](http://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/RcppCWB)](https://cran.r-project.org/package=RcppCWB) [![R build status](https://github.com/PolMine/RcppCWB/workflows/R-CMD-check/badge.svg)](https://github.com/PolMine/RcppCWB/actions) diff --git a/README.md b/README.md index 20d994a..684a107 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7040475.svg)](https://doi.org/10.5281/zenodo.7040475) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8104632.svg)](https://doi.org/10.5281/zenodo.8104632) [![License: GPL v3](http://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/RcppCWB)](https://cran.r-project.org/package=RcppCWB) @@ -185,7 +185,7 @@ cqp_initialize(registry = registry) cqp_query(corpus = "REUTERS", query = '"crude" "oil"') ``` - ## + ## ``` r cpos <- cqp_dump_subcorpus(corpus = "REUTERS") From 5daa657fa0687cd2215609b6dbd5f480f27af091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 27 Oct 2023 10:25:23 +0200 Subject: [PATCH 25/90] cl_struc_values() using fs::path() #77 --- DESCRIPTION | 4 ++-- NEWS.md | 5 +++++ R/cl.R | 12 ++++++------ tests/testthat/test_struc_values.R | 25 +++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 tests/testthat/test_struc_values.R diff --git a/DESCRIPTION b/DESCRIPTION index 9518db7..1d72ec4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.2 -Date: 2023-06-29 +Version: 0.6.2.9001 +Date: 2023-10-27 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index e8e9627..b5a6df2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 
+1,8 @@ +# RcppCWB 0.6.3 + +* `cl_struc_values()` does not duplicate registry directories any more #77. + + # RcppCWB 0.6.2 * The configure script now covers the case of Power PCs. Files for the power pc diff --git a/R/cl.R b/R/cl.R index e975a85..362e668 100644 --- a/R/cl.R +++ b/R/cl.R @@ -1,10 +1,10 @@ #' Get Attribute Size (of Positional/Structural Attribute). #' -#' Use \code{cl_attribute_size} to get the total number of values of a -#' positional attribute (param \code{attribute_type} = "p"), or structural -#' attribute (param \code{attribute_type} = "s"). Note that indices are -#' zero-based, i.e. the maximum position of a positional / structural -#' attribute is attribute size minus 1 (see examples). +#' Use `cl_attribute_size()` to get the total number of values of a positional +#' attribute (param `attribute_type` = "p"), or structural attribute (param +#' `attribute_type` = "s"). Note that indices are zero-based, i.e. the maximum +#' position of a positional / structural attribute is attribute size minus 1 +#' (see examples). 
#' @rdname cl_attribute_size #' @param corpus name of a CWB corpus (upper case) #' @param attribute name of a p- or s-attribute @@ -372,7 +372,7 @@ cl_charset_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){ #' cl_struc_values("REUTERS", "date") # NA - attribute does not exist cl_struc_values <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){ check_corpus(corpus = corpus, registry = registry, cqp = FALSE) - registry <- normalizePath(path.expand(registry)) + registry <- path(path_expand(registry)) .cl_struc_values(corpus = corpus, s_attribute = s_attribute, registry = registry) } diff --git a/tests/testthat/test_struc_values.R b/tests/testthat/test_struc_values.R new file mode 100644 index 0000000..9d6ea44 --- /dev/null +++ b/tests/testthat/test_struc_values.R @@ -0,0 +1,25 @@ +library(RcppCWB) +use_tmp_registry() +testthat::context("struc2cpos") + +test_that( + "cl_struc_values", + { + # This addresses issue #77: cl_struc_values() would have the result that + # a corpus is loaded twice. 
+ + regdir <- corpus_registry_dir("REUTERS") + + expect_identical( + length(corpus_registry_dir("REUTERS")), + 1L + ) + + cl_struc_values(corpus = "REUTERS", s_attribute = "id", registry = regdir) + + expect_identical( + length(corpus_registry_dir("REUTERS")), + 1L + ) + } +) From a2333425346c8848e07d499a4acd5f7475d244f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 28 Nov 2023 23:16:42 +0100 Subject: [PATCH 26/90] fix format security issue #86 --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ R/RcppExports.R | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9518db7..355901a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.2 -Date: 2023-06-29 +Version: 0.6.2.1 +Date: 2023-11-28 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index e8e9627..a3e6dd4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# RcppCWB 0.6.2.9001 +* Fix format-security issue under r-devel #86. + # RcppCWB 0.6.2 * The configure script now covers the case of Power PCs. 
Files for the power pc diff --git a/R/RcppExports.R b/R/RcppExports.R index 82d81a7..33452e5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -406,5 +406,5 @@ cpos_to_rbound <- function(s_attr, cpos) { # Register entry points for exported C++ functions methods::setLoadAction(function(ns) { - .Call('_RcppCWB_RcppExport_registerCCallable', PACKAGE = 'RcppCWB') + .Call(`_RcppCWB_RcppExport_registerCCallable`) }) From cba900856bddd745c502957b230b0163d4b48b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 28 Nov 2023 23:25:15 +0100 Subject: [PATCH 27/90] fix format security issue take 2 #86 --- src/RcppExports.cpp | 134 ++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index af24686..b3656f5 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -192,7 +192,7 @@ RcppExport SEXP _RcppCWB_cwb_version() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -225,7 +225,7 @@ RcppExport SEXP _RcppCWB_p_attr_default() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -261,7 +261,7 @@ RcppExport SEXP _RcppCWB_s_attr(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP regi if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -297,7 +297,7 @@ RcppExport SEXP _RcppCWB_p_attr(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP regi if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", 
CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -334,7 +334,7 @@ RcppExport SEXP _RcppCWB_attribute_size(SEXP corpusSEXP, SEXP attributeSEXP, SEX if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -368,7 +368,7 @@ RcppExport SEXP _RcppCWB_p_attr_size(SEXP p_attrSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -402,7 +402,7 @@ RcppExport SEXP _RcppCWB_s_attr_size(SEXP s_attrSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -436,7 +436,7 @@ RcppExport SEXP _RcppCWB_p_attr_lexicon_size(SEXP p_attrSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -472,7 +472,7 @@ RcppExport SEXP _RcppCWB__cl_lexicon_size(SEXP corpusSEXP, SEXP p_attributeSEXP, if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -509,7 +509,7 @@ RcppExport SEXP _RcppCWB__cl_cpos2struc(SEXP corpusSEXP, SEXP s_attributeSEXP, S if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -544,7 +544,7 @@ RcppExport SEXP _RcppCWB_cpos_to_struc(SEXP s_attrSEXP, SEXP cposSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = 
Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -581,7 +581,7 @@ RcppExport SEXP _RcppCWB_cpos2str(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP re if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -616,7 +616,7 @@ RcppExport SEXP _RcppCWB_cpos_to_str(SEXP p_attrSEXP, SEXP cposSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -653,7 +653,7 @@ RcppExport SEXP _RcppCWB_cpos2id(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP reg if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -688,7 +688,7 @@ RcppExport SEXP _RcppCWB_cpos_to_id(SEXP p_attrSEXP, SEXP cposSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -725,7 +725,7 @@ RcppExport SEXP _RcppCWB_struc2cpos(SEXP corpusSEXP, SEXP s_attributeSEXP, SEXP if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -760,7 +760,7 @@ RcppExport SEXP _RcppCWB_struc_to_cpos(SEXP s_attrSEXP, SEXP strucSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -797,7 
+797,7 @@ RcppExport SEXP _RcppCWB_id2str(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP regi if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -834,7 +834,7 @@ RcppExport SEXP _RcppCWB__cl_struc2str(SEXP corpusSEXP, SEXP s_attributeSEXP, SE if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -869,7 +869,7 @@ RcppExport SEXP _RcppCWB_struc_to_str(SEXP s_attrSEXP, SEXP strucSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -906,7 +906,7 @@ RcppExport SEXP _RcppCWB__cl_regex2id(SEXP corpusSEXP, SEXP p_attributeSEXP, SEX if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -941,7 +941,7 @@ RcppExport SEXP _RcppCWB_regex_to_id(SEXP p_attrSEXP, SEXP regexSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -978,7 +978,7 @@ RcppExport SEXP _RcppCWB__cl_str2id(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1013,7 +1013,7 @@ RcppExport SEXP _RcppCWB_str_to_id(SEXP p_attrSEXP, SEXP strSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); 
UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1050,7 +1050,7 @@ RcppExport SEXP _RcppCWB__cl_id2freq(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1085,7 +1085,7 @@ RcppExport SEXP _RcppCWB_id_to_freq(SEXP p_attrSEXP, SEXP idSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1122,7 +1122,7 @@ RcppExport SEXP _RcppCWB__cl_id2cpos(SEXP corpusSEXP, SEXP p_attributeSEXP, SEXP if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1157,7 +1157,7 @@ RcppExport SEXP _RcppCWB_id_to_cpos(SEXP p_attrSEXP, SEXP idSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1194,7 +1194,7 @@ RcppExport SEXP _RcppCWB_cl_cpos2lbound(SEXP corpusSEXP, SEXP s_attributeSEXP, S if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1229,7 +1229,7 @@ RcppExport SEXP _RcppCWB_cpos_to_lbound(SEXP s_attrSEXP, SEXP cposSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1266,7 +1266,7 @@ RcppExport 
SEXP _RcppCWB_cl_cpos2rbound(SEXP corpusSEXP, SEXP s_attributeSEXP, S if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1301,7 +1301,7 @@ RcppExport SEXP _RcppCWB_cpos_to_rbound(SEXP s_attrSEXP, SEXP cposSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1336,7 +1336,7 @@ RcppExport SEXP _RcppCWB__cl_find_corpus(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1372,7 +1372,7 @@ RcppExport SEXP _RcppCWB__cl_new_attribute(SEXP corpus_pointerSEXP, SEXP s_attri if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1407,7 +1407,7 @@ RcppExport SEXP _RcppCWB__cl_delete_corpus(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1442,7 +1442,7 @@ RcppExport SEXP _RcppCWB__corpus_is_loaded(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1477,7 +1477,7 @@ RcppExport SEXP _RcppCWB__cl_charset_name(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = 
Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1513,7 +1513,7 @@ RcppExport SEXP _RcppCWB__cl_struc_values(SEXP corpusSEXP, SEXP s_attributeSEXP, if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1548,7 +1548,7 @@ RcppExport SEXP _RcppCWB__corpus_data_dir(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1583,7 +1583,7 @@ RcppExport SEXP _RcppCWB__corpus_info_file(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1618,7 +1618,7 @@ RcppExport SEXP _RcppCWB__corpus_full_name(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1653,7 +1653,7 @@ RcppExport SEXP _RcppCWB_corpus_p_attributes(SEXP corpusSEXP, SEXP registrySEXP) if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1688,7 +1688,7 @@ RcppExport SEXP _RcppCWB_corpus_s_attributes(SEXP corpusSEXP, SEXP registrySEXP) if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } 
UNPROTECT(1); return rcpp_result_gen; @@ -1723,7 +1723,7 @@ RcppExport SEXP _RcppCWB_corpus_properties(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1759,7 +1759,7 @@ RcppExport SEXP _RcppCWB_corpus_property(SEXP corpusSEXP, SEXP registrySEXP, SEX if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1794,7 +1794,7 @@ RcppExport SEXP _RcppCWB_cl_load_corpus(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1827,7 +1827,7 @@ RcppExport SEXP _RcppCWB_cl_list_corpora() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1861,7 +1861,7 @@ RcppExport SEXP _RcppCWB_corpus_registry_dir(SEXP corpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1893,7 +1893,7 @@ RcppExport SEXP _RcppCWB_init_cqp() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1927,7 +1927,7 @@ RcppExport SEXP _RcppCWB_cqp_verbosity(SEXP quietlySEXP, SEXP verboseSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - 
Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1960,7 +1960,7 @@ RcppExport SEXP _RcppCWB_cqp_get_registry() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -1993,7 +1993,7 @@ RcppExport SEXP _RcppCWB_cqp_get_status() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2027,7 +2027,7 @@ RcppExport SEXP _RcppCWB_cqp_set_registry(SEXP registry_dirSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2060,7 +2060,7 @@ RcppExport SEXP _RcppCWB_cqp_list_corpora() { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2096,7 +2096,7 @@ RcppExport SEXP _RcppCWB_cqp_query(SEXP corpusSEXP, SEXP subcorpusSEXP, SEXP que if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2130,7 +2130,7 @@ RcppExport SEXP _RcppCWB_cqp_subcorpus_size(SEXP scorpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2164,7 +2164,7 @@ RcppExport SEXP _RcppCWB_cqp_list_subcorpora(SEXP inCorpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = 
Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2198,7 +2198,7 @@ RcppExport SEXP _RcppCWB_cqp_dump_subcorpus(SEXP inSubcorpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2232,7 +2232,7 @@ RcppExport SEXP _RcppCWB_cqp_subcorpus_regions(SEXP subcorpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2266,7 +2266,7 @@ RcppExport SEXP _RcppCWB_cqp_drop_subcorpus(SEXP inSubcorpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2300,7 +2300,7 @@ RcppExport SEXP _RcppCWB_check_corpus(SEXP corpusSEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2335,7 +2335,7 @@ RcppExport SEXP _RcppCWB_cqp_load_corpus(SEXP corpusSEXP, SEXP registrySEXP) { if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2371,7 +2371,7 @@ RcppExport SEXP _RcppCWB_region_matrix_to_subcorpus(SEXP region_matrixSEXP, SEXP if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2407,7 +2407,7 @@ RcppExport SEXP 
_RcppCWB_cwb_makeall(SEXP xSEXP, SEXP registry_dirSEXP, SEXP p_a if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2443,7 +2443,7 @@ RcppExport SEXP _RcppCWB_cwb_huffcode(SEXP xSEXP, SEXP registry_dirSEXP, SEXP p_ if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2479,7 +2479,7 @@ RcppExport SEXP _RcppCWB_cwb_compress_rdx(SEXP xSEXP, SEXP registry_dirSEXP, SEX if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; @@ -2524,7 +2524,7 @@ RcppExport SEXP _RcppCWB_cwb_encode(SEXP regfileSEXP, SEXP data_dirSEXP, SEXP vr if (rcpp_isError_gen) { SEXP rcpp_msgSEXP_gen = Rf_asChar(rcpp_result_gen); UNPROTECT(1); - Rf_error(CHAR(rcpp_msgSEXP_gen)); + Rf_error("%s", CHAR(rcpp_msgSEXP_gen)); } UNPROTECT(1); return rcpp_result_gen; From a29de0433fab4ba57138708ca435ddd57c7eccd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 28 Nov 2023 23:35:46 +0100 Subject: [PATCH 28/90] push version to 0.6.3 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9098c45..82306b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.2.9002 +Version: 0.6.3 Date: 2023-11-28 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], From 0185c3c7a44f0f6980a08c975732fcbb7292e238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Tue, 28 Nov 2023 23:39:11 +0100 Subject: [PATCH 29/90] cran comments for release v0.6.3 
--- cran-comments.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index 5af0e5f..a9add27 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,16 +1,6 @@ ## General remarks -This is a quick follow up to v0.6.1. I address robustness issues and warnings -that are newly reported by clang (starting with clang 14). - -Considerung check results, there are ERRORs on macOS build machines: At the -linker stage, you see: "Symbol not found: _objc_msgSend$UTF8String" -(r-release-macos-arm64 and r-oldrel-macos-arm64). - -I sent a message to the R-package-devel list and Simon Urbanek explained that -an update of macOS would be required to solve this issue. If this update has -been implemented, I hope that this release will trigger new macOS tests that -succeed. +Fixes -Wformat-security issue under r-devel caused by Rcpp. Previous aspects I repeat: From 289f031db229bb48a355b59f489d96967024ee94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 29 Nov 2023 13:13:39 +0100 Subject: [PATCH 30/90] test and modification of get_region_matrix() for negative struc values #87 --- R/region_matrix.R | 25 ++++++++++++++----------- src/addons.cpp | 11 ++++++++--- tests/testthat/test_get_region_matrix.R | 15 +++++++++++++++ 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/R/region_matrix.R b/R/region_matrix.R index 1605866..5086aa9 100644 --- a/R/region_matrix.R +++ b/R/region_matrix.R @@ -1,21 +1,24 @@ #' Get Matrix with Regions for Strucs. #' -#' The return value is an integer matrix with the left and right corpus positions -#' of the strucs in columns one and two, respectively. +#' The return value is an `integer` matrix with the left and right corpus +#' positions of the strucs in columns one and two, respectively. For negative +#' struc values in the input vector, the matrix reports `NA` values. 
#' -#' @param corpus a CWB corpus -#' @param s_attribute a structural attribute -#' @param strucs strucs -#' @param registry the registry directory +#' @param corpus A CWB corpus (length-one `character` vector). +#' @param s_attribute A structural attribute (length-one `character` vector). +#' @param strucs Integer vector with strucs. +#' @param registry Registry directory with registry file. #' @rdname get_region_matrix #' @export get_region_matrix -#' @return A matrix with integer values indicating left and right corpus positions -#' (columns 1 and 2, respectively). +#' @return A matrix with integer values indicating left and right corpus +#' positions (columns 1 and 2, respectively). #' @examples #' y <- get_region_matrix( -#' corpus = "REUTERS", s_attribute = "id", -#' strucs = 0L:5L, registry = get_tmp_registry() -#' ) +#' corpus = "REUTERS", +#' s_attribute = "id", +#' strucs = 0L:5L, +#' registry = get_tmp_registry() +#' ) get_region_matrix <- function(corpus, s_attribute, strucs, registry = Sys.getenv("CORPUS_REGISTRY")){ check_registry(registry) check_corpus(corpus, registry) diff --git a/src/addons.cpp b/src/addons.cpp index 6d99ee4..9685389 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -80,9 +80,14 @@ Rcpp::IntegerMatrix get_region_matrix(SEXP corpus, SEXP s_attribute, SEXP strucs Rcpp::IntegerMatrix cpos_matrix(strucs_length,2); for (n = 0; n < strucs_length; n++){ - cl_struc2cpos(att, strucs_int[n], &start, &end); - cpos_matrix(n,0) = start; - cpos_matrix(n,1) = end; + if (strucs_int[n] >= 0){ + cl_struc2cpos(att, strucs_int[n], &start, &end); + cpos_matrix(n,0) = start; + cpos_matrix(n,1) = end; + } else { + cpos_matrix(n,0) = NA_INTEGER; + cpos_matrix(n,1) = NA_INTEGER; + } } return cpos_matrix; } diff --git a/tests/testthat/test_get_region_matrix.R b/tests/testthat/test_get_region_matrix.R index 2884cef..02aaf97 100644 --- a/tests/testthat/test_get_region_matrix.R +++ b/tests/testthat/test_get_region_matrix.R @@ -15,3 +15,18 @@ test_that( } ) 
+test_that( + "NA for negative values", + { + regions <- get_region_matrix( + corpus = "REUTERS", + registry = RcppCWB::corpus_registry_dir("REUTERS")[[1]], + s_attribute = "id", + strucs = c(-1, 0:2) + ) + + expect_identical(regions[1,1], NA_integer_) + expect_identical(regions[1,2], NA_integer_) + + } +) \ No newline at end of file From 37a6928e0c5180081fb243d2210efc6419250def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 29 Nov 2023 15:09:23 +0100 Subject: [PATCH 31/90] ranges_to_cpos() drops rows with NA values --- DESCRIPTION | 2 +- NEWS.md | 11 ++++++++-- R/checks.R | 6 +++++ R/region_matrix.R | 20 ++++++++++++----- README.Rmd | 2 +- README.md | 6 ++--- src/addons.cpp | 4 ++-- .../test_region_matrix_to_struc_matrix.R | 22 +++++++++++++++++++ 8 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 tests/testthat/test_region_matrix_to_struc_matrix.R diff --git a/DESCRIPTION b/DESCRIPTION index 82306b2..706db7a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') Version: 0.6.3 -Date: 2023-11-28 +Date: 2023-11-29 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index 04ad444..2356470 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,14 @@ -# RcppCWB 0.6.2.9002 +# RcppCWB 0.6.3 -* Fix format-security issue under r-devel #86. * `cl_struc_values()` does not duplicate registry directories any more #77. +* Fix format-security issue under r-devel #86. +* `get_region_matrix()` reports NA values for negative strucs #87. +* `region_matrix_to_struc_matrix()` returns NA values for regions without +nested region as declared in the documentation #88. +* `check_strucs()` issues warning if negative values are passed and if lenght of +input vector is 0. +* `ranges_to_cpos()` drops rows from input matrix with NA values and issues +a respective warning. 
# RcppCWB 0.6.2 diff --git a/R/checks.R b/R/checks.R index e5d8d77..896f10b 100644 --- a/R/checks.R +++ b/R/checks.R @@ -103,10 +103,16 @@ check_p_attribute <- function(p_attribute, corpus, registry = Sys.getenv("CORPUS check_strucs <- function(corpus, s_attribute, strucs, registry){ if (!is.numeric(strucs)) stop("strucs needs to be a integer vector") + if (length(strucs) == 0L){ + warning("struc vector has length 0") + return(TRUE) + } if (max(strucs) > (cl_attribute_size(corpus, attribute = s_attribute, "s", registry = registry) - 1L)) stop("highest value of strucs may not be larger than size of structural attribute") if (any(is.na(strucs))) stop("there is an NA value among strucs") + if (any(strucs < 0)) + warning("struc vector includes negative values, result may include NA values") return( TRUE ) } diff --git a/R/region_matrix.R b/R/region_matrix.R index 5086aa9..1bd2840 100644 --- a/R/region_matrix.R +++ b/R/region_matrix.R @@ -122,12 +122,22 @@ region_matrix_context <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY #' positions in first and second column, respectively). 
#' @export ranges_to_cpos <- function(ranges){ - stopifnot( - is.integer(ranges), - is.matrix(ranges), - all(ranges[,2] >= ranges[,1]) - ) + stopifnot(is.integer(ranges), is.matrix(ranges)) + + if (any(is.na(ranges))){ + drop <- unique(c(which(is.na(ranges[,1])), which(is.na(ranges[,2])))) + warning( + sprintf("matrix includes NA values, dropping %d rows", length(drop)) + ) + ranges <- ranges[-drop,] + # if only one row left, we have a vector to be turned into matrix again + if (length(ranges) == 2L) ranges <- matrix(ranges, nrow = 1L) + } + if (nrow(ranges) == 0L) return(integer()) + + stopifnot(all(ranges[,2] >= ranges[,1])) + if (ncol(ranges) != 2L){ warning( "ranges_to_cpos() requires two-column integer matrix as input ", diff --git a/README.Rmd b/README.Rmd index 156c033..a075800 100644 --- a/README.Rmd +++ b/README.Rmd @@ -59,7 +59,7 @@ devtools::install_github("PolMine/RcppCWB", ref = "dev") ## Installation on macOS -On macOS, the [pcre2](http://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. We recommend to use 'Homebrew' as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew Website](https://brew.sh/index_de.html). It may also be necessary to also install [Xcode](https://developer.apple.com/xcode/) and [XQuartz](https://www.xquartz.org). +On macOS, the [pcre2](http://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. We recommend to use 'Homebrew' as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew Website](https://brew.sh). It may also be necessary to also install [Xcode](https://developer.apple.com/xcode/) and [XQuartz](https://www.xquartz.org). The following commands then need to be executed from a terminal window. 
They will install the C libraries the CWB relies on: diff --git a/README.md b/README.md index 684a107..0530680 100644 --- a/README.md +++ b/README.md @@ -80,8 +80,8 @@ On macOS, the [pcre2](http://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. We recommend to use ‘Homebrew’ as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew -Website](https://brew.sh/index_de.html). It may also be necessary to -also install [Xcode](https://developer.apple.com/xcode/) and +Website](https://brew.sh). It may also be necessary to also install +[Xcode](https://developer.apple.com/xcode/) and [XQuartz](https://www.xquartz.org). The following commands then need to be executed from a terminal window. @@ -185,7 +185,7 @@ cqp_initialize(registry = registry) cqp_query(corpus = "REUTERS", query = '"crude" "oil"') ``` - ## + ## ``` r cpos <- cqp_dump_subcorpus(corpus = "REUTERS") diff --git a/src/addons.cpp b/src/addons.cpp index 9685389..bea8077 100644 --- a/src/addons.cpp +++ b/src/addons.cpp @@ -485,8 +485,8 @@ Rcpp::IntegerMatrix region_matrix_to_struc_matrix(SEXP corpus, SEXP s_attribute, regions(i,1)--; }; - if (regions(i,0) < 0) regions(i,0) = NA_INTEGER; - if (regions(i,1) < 0) regions(i,1) = NA_INTEGER; + if (struc_matrix(i,0) < 0) struc_matrix(i,0) = NA_INTEGER; + if (struc_matrix(i,1) < 0) struc_matrix(i,1) = NA_INTEGER; } return struc_matrix; diff --git a/tests/testthat/test_region_matrix_to_struc_matrix.R b/tests/testthat/test_region_matrix_to_struc_matrix.R new file mode 100644 index 0000000..a872776 --- /dev/null +++ b/tests/testthat/test_region_matrix_to_struc_matrix.R @@ -0,0 +1,22 @@ +library(RcppCWB) +use_tmp_registry() +testthat::context("region_matrix_to_struc_matrix") + +test_that( + "region_matrix_to_struc_matrix", + { + use_tmp_registry(pkg = system.file(package = "GermaParl2")) + + if ("germaparl2mini" %in% cl_list_corpora()){ + m <- RcppCWB::region_matrix_to_struc_matrix( + corpus = 
"GERMAPARL2MINI", + s_attribute = "ne", + registry = get_tmp_registry(), + region_matrix = matrix(c(2770, 2785), ncol = 2, byrow = TRUE) + ) + + expect_identical(m[1,1], NA_integer_) + expect_identical(m[1,2], NA_integer_) + } + } +) From 74cb53ca949dd89099c227157c273b4d19801528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 29 Nov 2023 16:09:19 +0100 Subject: [PATCH 32/90] rm typo in NEWS for 2nd release of v0.9.3 --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 2356470..d76304b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,7 @@ * `get_region_matrix()` reports NA values for negative strucs #87. * `region_matrix_to_struc_matrix()` returns NA values for regions without nested region as declared in the documentation #88. -* `check_strucs()` issues warning if negative values are passed and if lenght of +* `check_strucs()` issues warning if negative values are passed and if length of input vector is 0. * `ranges_to_cpos()` drops rows from input matrix with NA values and issues a respective warning. 
From 1d6e3a243404813ecab48acd45dbdb9e2bc0e196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 18 Feb 2024 22:41:02 +0100 Subject: [PATCH 33/90] message if deleting file fails #89 --- DESCRIPTION | 4 ++-- R/cwb.R | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 706db7a..f6975a2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.3 -Date: 2023-11-29 +Version: 0.6.3.9001 +Date: 2024-02-18 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/R/cwb.R b/R/cwb.R index 4f644ca..45a173a 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -132,7 +132,12 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI fname <- path(data_dir, sprintf("%s.corpus", p_attribute)) if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing") removed <- file.remove(fname) - if (removed) if (!quietly) message("redundant file deleted: ", fname) + if (removed){ + if (!quietly) message("redundant file deleted: ", fname) + } else { + message("could not delete redundant file: ", fname) + print(list.files(data_dir)) + } } success @@ -171,12 +176,22 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ rev_file <- path(data_dir, sprintf("%s.corpus.rev", p_attribute)) if (!file.exists(rev_file)) warning("cwb_huffcode: file to delete missing") removed <- file.remove(rev_file) - if (removed) if (!quietly) message("redundant file deleted: ", rev_file) + if (removed){ + if (!quietly) message("redundant file deleted: ", rev_file) + } else { + message("could not delete redundant file: ", rev_file) + print(list.files(data_dir)) + } rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute)) if (!file.exists(rdx_file)) warning("cwb_huffcode: file to delete missing") removed <- 
file.remove(rdx_file) - if (removed) if (!quietly) message("redundant file deleted: ", rdx_file) + if (removed){ + if (!quietly) message("redundant file deleted: ", rdx_file) + } else { + message("could not delete redundant file: ", rdx_file) + print(list.files(data_dir)) + } } success From 4a8cfd565f6e5f2d0f94edcee501cb5f8f69ed69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 18 Feb 2024 23:20:02 +0100 Subject: [PATCH 34/90] print file info --- R/cwb.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/cwb.R b/R/cwb.R index 45a173a..a7ae0f2 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -137,6 +137,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI } else { message("could not delete redundant file: ", fname) print(list.files(data_dir)) + print(file.info(fname)) } } @@ -181,6 +182,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ } else { message("could not delete redundant file: ", rev_file) print(list.files(data_dir)) + print(file.info(rev_file)) } rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute)) @@ -191,6 +193,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ } else { message("could not delete redundant file: ", rdx_file) print(list.files(data_dir)) + print(file.info(rdx_file)) } } From 1db33f18a90e2599737f0b9d1a020f2714c0453d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 18 Feb 2024 23:37:57 +0100 Subject: [PATCH 35/90] check file mode --- R/cwb.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/cwb.R b/R/cwb.R index a7ae0f2..8f3711b 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -131,6 +131,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI data_dir <- corpus_data_dir(corpus = corpus, registry = registry) fname <- path(data_dir, sprintf("%s.corpus", p_attribute)) if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing") + 
print(file.mode(fname, mode = 2)) removed <- file.remove(fname) if (removed){ if (!quietly) message("redundant file deleted: ", fname) @@ -138,6 +139,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI message("could not delete redundant file: ", fname) print(list.files(data_dir)) print(file.info(fname)) + print(file.mode(fname, mode = 2)) } } @@ -176,6 +178,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ rev_file <- path(data_dir, sprintf("%s.corpus.rev", p_attribute)) if (!file.exists(rev_file)) warning("cwb_huffcode: file to delete missing") + print(file.mode(rev_file)) removed <- file.remove(rev_file) if (removed){ if (!quietly) message("redundant file deleted: ", rev_file) @@ -183,10 +186,12 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ message("could not delete redundant file: ", rev_file) print(list.files(data_dir)) print(file.info(rev_file)) + print(file.mode(rev_file)) } rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute)) if (!file.exists(rdx_file)) warning("cwb_huffcode: file to delete missing") + print(file.mode(rdx_file)) removed <- file.remove(rdx_file) if (removed){ if (!quietly) message("redundant file deleted: ", rdx_file) @@ -194,6 +199,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ message("could not delete redundant file: ", rdx_file) print(list.files(data_dir)) print(file.info(rdx_file)) + print(file.mode(rdx_file)) } } From 408a74e0fcf8db1200e0ac56b4eca47c62a9305e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 18 Feb 2024 23:46:53 +0100 Subject: [PATCH 36/90] file.access --- R/cwb.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index 8f3711b..f5ffbee 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -131,7 +131,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI data_dir <- 
corpus_data_dir(corpus = corpus, registry = registry) fname <- path(data_dir, sprintf("%s.corpus", p_attribute)) if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing") - print(file.mode(fname, mode = 2)) + print(file.access(fname, mode = 2)) removed <- file.remove(fname) if (removed){ if (!quietly) message("redundant file deleted: ", fname) @@ -139,7 +139,7 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI message("could not delete redundant file: ", fname) print(list.files(data_dir)) print(file.info(fname)) - print(file.mode(fname, mode = 2)) + print(file.access(fname, mode = 2)) } } @@ -178,7 +178,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ rev_file <- path(data_dir, sprintf("%s.corpus.rev", p_attribute)) if (!file.exists(rev_file)) warning("cwb_huffcode: file to delete missing") - print(file.mode(rev_file)) + print(file.access(rev_file, mode = 2)) removed <- file.remove(rev_file) if (removed){ if (!quietly) message("redundant file deleted: ", rev_file) @@ -186,12 +186,12 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ message("could not delete redundant file: ", rev_file) print(list.files(data_dir)) print(file.info(rev_file)) - print(file.mode(rev_file)) + print(file.access(rev_file, mode = 2)) } rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute)) if (!file.exists(rdx_file)) warning("cwb_huffcode: file to delete missing") - print(file.mode(rdx_file)) + print(file.access(rdx_file, mode = 2)) removed <- file.remove(rdx_file) if (removed){ if (!quietly) message("redundant file deleted: ", rdx_file) @@ -199,7 +199,7 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ message("could not delete redundant file: ", rdx_file) print(list.files(data_dir)) print(file.info(rdx_file)) - print(file.mode(rdx_file)) + print(file.access(rdx_file, mode = 2)) } } From 182fc8e052825a9e49ad1a96ebccd7284df16092 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 18 Feb 2024 23:57:44 +0100 Subject: [PATCH 37/90] try fs::file_delete --- R/cwb.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index f5ffbee..d51cc7a 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -132,15 +132,17 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI fname <- path(data_dir, sprintf("%s.corpus", p_attribute)) if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing") print(file.access(fname, mode = 2)) - removed <- file.remove(fname) - if (removed){ - if (!quietly) message("redundant file deleted: ", fname) - } else { - message("could not delete redundant file: ", fname) - print(list.files(data_dir)) - print(file.info(fname)) - print(file.access(fname, mode = 2)) - } + # removed <- file.remove(fname) + removed <- file_delete(fname) + print(removed) + # if (removed){ + # if (!quietly) message("redundant file deleted: ", fname) + # } else { + # message("could not delete redundant file: ", fname) + # print(list.files(data_dir)) + # print(file.info(fname)) + # print(file.access(fname, mode = 2)) + # } } success From 89dfa817f2204048567ec7979c3263f8c8efd6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 19 Feb 2024 21:56:04 +0100 Subject: [PATCH 38/90] cwb_huffcode() and cwb_compress_rdx() delete redundant files on Windows #89 --- DESCRIPTION | 4 +-- NEWS.md | 5 +++ R/cwb.R | 93 +++++++++++++++++++++++++++++++++-------------------- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f6975a2..38f863f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.3.9001 -Date: 2024-02-18 +Version: 0.6.3.9002 +Date: 2024-02-19 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git 
a/NEWS.md b/NEWS.md index d76304b..3007dee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# RcppCWB 0.6.3.9001 + +* `cwb_huffcode()` and `cwb_compress_rdx()` did not delete redundant files on +Windows. Fixed by temporarily unloading the corpus #89. + # RcppCWB 0.6.3 * `cl_struc_values()` does not duplicate registry directories any more #77. diff --git a/R/cwb.R b/R/cwb.R index d51cc7a..b4764f7 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -1,7 +1,7 @@ #' CWB Tools for Creating Corpora #' -#' Wrappers for the CWB tools (`cwb-makeall`, `cwb-huffcode`, -#' `cwb-compress-rdx`). Unlike the 'original' command line tools, these wrappers +#' Wrappers for the CWB tools `cwb-makeall`, `cwb-huffcode` and +#' `cwb-compress-rdx`. Unlike the 'original' command line tools, these wrappers #' will always perform a specific indexing/compression step on one positional #' attribute, and produce all components. #' @@ -15,13 +15,16 @@ #' (counter of tokens processed). #' @examples #' # The package includes and 'unfinished' corpus of debates in the UN General -#' # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is -#' # not compressed. +#' # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it +#' # is not compressed. #' # #' # The first step in the following example is to copy the raw #' # corpus to a temporary place. #' -#' home_dir <- system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga") +#' home_dir <- system.file( +#' package = "RcppCWB", +#' "extdata", "cwb", "indexed_corpora", "unga" +#' ) #' #' tmp_data_dir <- file.path(tempdir(), "indexed_corpora") #' tmp_unga_dir <- file.path(tmp_data_dir, "unga2") @@ -104,9 +107,15 @@ cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGIS #' @rdname cwb_utils #' @export cwb_huffcode -#' @param delete A `logical` value, whether to remove redundant files after -#' compression. 
-cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){ +#' @param delete A `logical` value, whether to remove redundant file +#' (p_attribute).corpus after compression. +cwb_huffcode <- function( + corpus, + p_attribute, + registry = Sys.getenv("CORPUS_REGISTRY"), + quietly = FALSE, + delete = TRUE + ){ registry <- path_expand(path(registry)) check_registry(registry) @@ -118,9 +127,14 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI )) } - huffcode <- function() - .cwb_huffcode(x = corpus, p_attribute = p_attribute, registry_dir = registry) - + huffcode <- function(){ + .cwb_huffcode( + x = corpus, + p_attribute = p_attribute, + registry_dir = registry + ) + } + if (quietly){ capture.output({success <- huffcode()}, type = "output") } else { @@ -131,18 +145,16 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI data_dir <- corpus_data_dir(corpus = corpus, registry = registry) fname <- path(data_dir, sprintf("%s.corpus", p_attribute)) if (!file.exists(fname)) warning("cwb_huffcode: file to delete missing") - print(file.access(fname, mode = 2)) - # removed <- file.remove(fname) - removed <- file_delete(fname) - print(removed) - # if (removed){ - # if (!quietly) message("redundant file deleted: ", fname) - # } else { - # message("could not delete redundant file: ", fname) - # print(list.files(data_dir)) - # print(file.info(fname)) - # print(file.access(fname, mode = 2)) - # } + # removing the file will fail on Windows unless corpus is unloaded #89 + cl_delete_corpus(corpus = corpus, registry = registry) + removed <- file.remove(fname) + if (removed){ + if (!quietly) message("redundant file deleted: ", fname) + } else { + message("could not delete redundant file: ", fname) + } + # reload corpus so that it is available again + cl_load_corpus(corpus = corpus, registry = registry) } success @@ -157,17 +169,33 @@ cwb_huffcode <- 
function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI #' p_attribute = "word", #' registry = get_tmp_registry() #' ) -cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, delete = TRUE){ +cwb_compress_rdx <- function( + corpus, + p_attribute, + registry = Sys.getenv("CORPUS_REGISTRY"), + quietly = FALSE, + delete = TRUE + ){ registry <- path_expand(path(registry)) check_registry(registry) regfile <- path(registry, tolower(corpus)) if (!file.exists(regfile)){ - stop(sprintf("No registry file for corpus '%s' in registry directory '%s'.", corpus, registry)) + stop( + sprintf( + "No registry file for corpus '%s' in registry directory '%s'.", + corpus, + registry + ) + ) } compress_rdx <-function() - .cwb_compress_rdx(x = corpus, p_attribute = p_attribute, registry_dir = registry) + .cwb_compress_rdx( + x = corpus, + p_attribute = p_attribute, + registry_dir = registry + ) if (quietly){ capture.output({success <- compress_rdx()}, type = "output") @@ -180,29 +208,24 @@ cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_ rev_file <- path(data_dir, sprintf("%s.corpus.rev", p_attribute)) if (!file.exists(rev_file)) warning("cwb_huffcode: file to delete missing") - print(file.access(rev_file, mode = 2)) + # deleting *.rev file fails on Windows unless corpus is unloaded #89 + cl_delete_corpus(corpus = corpus, registry = registry) removed <- file.remove(rev_file) if (removed){ if (!quietly) message("redundant file deleted: ", rev_file) } else { message("could not delete redundant file: ", rev_file) - print(list.files(data_dir)) - print(file.info(rev_file)) - print(file.access(rev_file, mode = 2)) } - + rdx_file <- path(data_dir, sprintf("%s.corpus.rdx", p_attribute)) if (!file.exists(rdx_file)) warning("cwb_huffcode: file to delete missing") - print(file.access(rdx_file, mode = 2)) removed <- file.remove(rdx_file) if (removed){ if (!quietly) message("redundant file deleted: ", 
rdx_file) } else { message("could not delete redundant file: ", rdx_file) - print(list.files(data_dir)) - print(file.info(rdx_file)) - print(file.access(rdx_file, mode = 2)) } + cl_load_corpus(corpus = corpus, registry = registry) } success From bd8cec5a50cf17c7bbfec8a308aa9e339d11ad56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 19 Feb 2024 21:57:25 +0100 Subject: [PATCH 39/90] update documentation #89 --- man/cl_attribute_size.Rd | 10 +++++----- man/cwb_utils.Rd | 17 ++++++++++------- man/get_region_matrix.Rd | 25 ++++++++++++++----------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/man/cl_attribute_size.Rd b/man/cl_attribute_size.Rd index a0c4c05..8f1b8c0 100644 --- a/man/cl_attribute_size.Rd +++ b/man/cl_attribute_size.Rd @@ -22,11 +22,11 @@ cl_attribute_size( environment variable CORPUS_REGISTRY} } \description{ -Use \code{cl_attribute_size} to get the total number of values of a -positional attribute (param \code{attribute_type} = "p"), or structural -attribute (param \code{attribute_type} = "s"). Note that indices are -zero-based, i.e. the maximum position of a positional / structural -attribute is attribute size minus 1 (see examples). +Use \code{cl_attribute_size()} to get the total number of values of a positional +attribute (param \code{attribute_type} = "p"), or structural attribute (param +\code{attribute_type} = "s"). Note that indices are zero-based, i.e. the maximum +position of a positional / structural attribute is attribute size minus 1 +(see examples). 
} \examples{ token_no <- cl_attribute_size( diff --git a/man/cwb_utils.Rd b/man/cwb_utils.Rd index a865571..14313a1 100644 --- a/man/cwb_utils.Rd +++ b/man/cwb_utils.Rd @@ -56,8 +56,8 @@ environment variable CORPUS_REGISTRY.} \item{quietly}{A \code{logical} value, whether to turn off messages (including warnings).} -\item{delete}{A \code{logical} value, whether to remove redundant files after -compression.} +\item{delete}{A \code{logical} value, whether to remove redundant file +(p_attribute).corpus after compression.} \item{data_dir}{The data directory where \code{cwb_encode} will save the binary files of the indexed corpus. Tilde expansion is performed on \code{data_dir} @@ -92,20 +92,23 @@ tokens} (counter of tokens processed).} } \description{ -Wrappers for the CWB tools (\code{cwb-makeall}, \code{cwb-huffcode}, -\code{cwb-compress-rdx}). Unlike the 'original' command line tools, these wrappers +Wrappers for the CWB tools \code{cwb-makeall}, \code{cwb-huffcode} and +\code{cwb-compress-rdx}. Unlike the 'original' command line tools, these wrappers will always perform a specific indexing/compression step on one positional attribute, and produce all components. } \examples{ # The package includes and 'unfinished' corpus of debates in the UN General -# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is -# not compressed. +# Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it +# is not compressed. # # The first step in the following example is to copy the raw # corpus to a temporary place. 
-home_dir <- system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga") +home_dir <- system.file( + package = "RcppCWB", + "extdata", "cwb", "indexed_corpora", "unga" +) tmp_data_dir <- file.path(tempdir(), "indexed_corpora") tmp_unga_dir <- file.path(tmp_data_dir, "unga2") diff --git a/man/get_region_matrix.Rd b/man/get_region_matrix.Rd index 8b4c84d..f3e059c 100644 --- a/man/get_region_matrix.Rd +++ b/man/get_region_matrix.Rd @@ -12,25 +12,28 @@ get_region_matrix( ) } \arguments{ -\item{corpus}{a CWB corpus} +\item{corpus}{A CWB corpus (length-one \code{character} vector).} -\item{s_attribute}{a structural attribute} +\item{s_attribute}{A structural attribute (length-one \code{character} vector).} -\item{strucs}{strucs} +\item{strucs}{Integer vector with strucs.} -\item{registry}{the registry directory} +\item{registry}{Registry directory with registry file.} } \value{ -A matrix with integer values indicating left and right corpus positions -(columns 1 and 2, respectively). +A matrix with integer values indicating left and right corpus +positions (columns 1 and 2, respectively). } \description{ -The return value is an integer matrix with the left and right corpus positions -of the strucs in columns one and two, respectively. +The return value is an \code{integer} matrix with the left and right corpus +positions of the strucs in columns one and two, respectively. For negative +struc values in the input vector, the matrix reports \code{NA} values. 
} \examples{ y <- get_region_matrix( - corpus = "REUTERS", s_attribute = "id", - strucs = 0L:5L, registry = get_tmp_registry() - ) + corpus = "REUTERS", + s_attribute = "id", + strucs = 0L:5L, + registry = get_tmp_registry() +) } From 41b9029e1e780216fec9bcf2019970a7871b7669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 09:47:28 +0100 Subject: [PATCH 40/90] cwb_encode() accepts empty list as arg s_attributes #90 --- R/cwb.R | 63 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index b4764f7..3e3f700 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -46,7 +46,11 @@ #' } #' #' # perform cwb_makeall (equivalent to cwb-makeall command line utility) -#' cwb_makeall(corpus = "UNGA2", p_attribute = "word", registry = get_tmp_registry()) +#' cwb_makeall( +#' corpus = "UNGA2", +#' p_attribute = "word", +#' registry = get_tmp_registry() +#' ) #' cl_load_corpus("UNGA2", registry = get_tmp_registry()) #' cqp_load_corpus("UNGA2", registry = get_tmp_registry()) #' @@ -59,7 +63,11 @@ #' corpus = "UNGA2", p_attribute = "word", #' registry = get_tmp_registry(), id = ids_sentence_1 #' ) -#' sentence <- gsub("\\s+([\\.,])", "\\1", paste(tokens_sentence_1, collapse = " ")) +#' sentence <- gsub( +#' "\\s+([\\.,])", +#' "\\1", +#' paste(tokens_sentence_1, collapse = " ") +#' ) #' #' # perform cwb_huffcode (equivalent to cwb-makeall command line utility) #' cwb_huffcode( @@ -71,7 +79,12 @@ #' @export cwb_makeall #' @importFrom utils capture.output #' @importFrom fs path path_expand -cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE){ +cwb_makeall <- function( + corpus, + p_attribute, + registry = Sys.getenv("CORPUS_REGISTRY"), + quietly = FALSE + ){ registry <- path_expand(path(registry)) check_registry(registry) @@ -281,9 +294,18 @@ cwb_compress_rdx <- function( #' unlink(data_dir) #' 
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin")) cwb_encode <- function( - corpus, registry = Sys.getenv("CORPUS_REGISTRY"), data_dir, vrt_dir, - encoding = "utf8", p_attributes = c("word", "pos", "lemma"), s_attributes, - skip_blank_lines = TRUE, strip_whitespace = TRUE, xml = TRUE, quietly = FALSE, verbose = FALSE + corpus, + registry = Sys.getenv("CORPUS_REGISTRY"), + data_dir, + vrt_dir, + encoding = "utf8", + p_attributes = c("word", "pos", "lemma"), + s_attributes = list(), + skip_blank_lines = TRUE, + strip_whitespace = TRUE, + xml = TRUE, + quietly = FALSE, + verbose = FALSE ){ if (encoding == "UTF-8") encoding <- "utf8" @@ -300,7 +322,6 @@ cwb_encode <- function( is.character(data_dir), length(data_dir) == 1L, dir.exists(data_dir), length(list.files(data_dir)) == 0L, is.character(vrt_dir), length(vrt_dir) == 1L, dir.exists(vrt_dir), - length(list.files(vrt_dir)) > 0L, is.character(encoding), length(encoding) == 1L, is.character(p_attributes), is.logical(skip_blank_lines), length(skip_blank_lines) == 1L, @@ -308,19 +329,33 @@ cwb_encode <- function( is.logical(xml), length(xml) == 1L ) + if (length(list.files(vrt_dir)) < 1L) + stop("No files in directory 'vrt_dir'") + s_attributes_noanno <- unlist(lapply( names(s_attributes), - function(s_attr) if (length(s_attributes[[s_attr]]) == 0L) s_attr else character() + function(s_attr) + if (length(s_attributes[[s_attr]]) == 0L) s_attr else character() )) + # zero-length scenario results in NULL + if (is.null(s_attributes_noanno)) s_attributes_noanno <- character() for (s_attr in s_attributes_noanno) s_attributes[[s_attr]] <- NULL s_attributes_anno <- unname( - sapply( - names(s_attributes), - function(s_attr) paste(s_attr, ":", 0L, "+", paste(s_attributes[[s_attr]], collapse = "+"), sep = "") + unlist( + lapply( + names(s_attributes), + function(s_attr) + paste( + s_attr, ":", 0L, "+", + paste(s_attributes[[s_attr]], collapse = "+"), + sep = "" + ) + ) ) ) + if (is.null(s_attributes_anno)) 
s_attributes_anno <- character() # Ensure that paths are standardized regfile <- as.character(fs::path(fs::path_expand(registry), tolower(corpus))) @@ -362,9 +397,9 @@ cwb_version <- function() as.numeric_version(.cwb_version()) #' supported by the Corpus Workbench (CWB). The vector is derived from the the #' `CorpusCharset` object defined in the header file of the corpus library (CL). #' -#' Early versions of the CWB were developed for "latin1", "utf8" support has been -#' introduced with CWB v3.2. Note that RcppCWB is tested only for "latin1" and -#' "utf8" and that R uses "UTF-8" rather than utf8" (CWB) by convention. +#' Early versions of the CWB were developed for "latin1", "utf8" support has +#' been introduced with CWB v3.2. Note that RcppCWB is tested only for "latin1" +#' and "utf8" and that R uses "UTF-8" rather than utf8" (CWB) by convention. #' @export #' @examples #' cwb_charsets() From 59a544e1d73a5f55da7124566a1e75f205b209f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 09:52:02 +0100 Subject: [PATCH 41/90] documentation updated for cwb_encode() arg s_attributes #90 --- R/cwb.R | 268 ++++++++++++++++++++++---------------------- man/cwb_charsets.Rd | 6 +- man/cwb_utils.Rd | 110 +++++++++--------- 3 files changed, 197 insertions(+), 187 deletions(-) diff --git a/R/cwb.R b/R/cwb.R index 3e3f700..96012d0 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -5,6 +5,27 @@ #' will always perform a specific indexing/compression step on one positional #' attribute, and produce all components. #' +#' @param p_attributes Positional attributes (p-attributes) to be declared. +#' @param data_dir The data directory where `cwb_encode` will save the binary +#' files of the indexed corpus. Tilde expansion is performed on `data_dir` +#' using `path.expand()` to avoid a crash. +#' @param vrt_dir Directory with input corpus files (verticalised format / file +#' ending *.vrt). 
Tilde expansion is performed on `vrt_dir` using +#' `path.expand()` to avoid a crash. +#' @param encoding The encoding of the files to be encoded. Needs to be an +#' encoding supported by CWB, see `cwb_charsets()`. "UTF-8" is taken as +#' "utf8". Defaults to "utf8" (recommended charset). +#' @param s_attributes A `list` of named `character` vectors to declare +#' structural attributes that shall be encoded. The names of the list are the +#' XML elements present in the corpus. Character vectors making up the list +#' declare the attributes that include the metadata of regions. To declare a +#' structural attribute without annotations, provide a zero-length character +#' vector using `character()` - see examples. +#' @param skip_blank_lines A `logical` value, whether to skip blank lines in the +#' input. +#' @param strip_whitespace A `logical` value, whether to strip whitespace from +#' tokens +#' @param xml A `logical` value, whether input is XML. #' @param corpus Name of a CWB corpus (upper case). #' @param p_attribute Name of p-attribute. #' @param registry Path to the registry directory, defaults to the value of the @@ -13,6 +34,120 @@ #' warnings). #' @param verbose A `logical` value, whether to show progress information #' (counter of tokens processed). 
+#' @rdname cwb_utils +#' @export cwb_encode +#' @importFrom fs path +#' @examples +#' data_dir <- file.path(tempdir(), "bt_data_dir") +#' dir.create(data_dir) +#' +#' cwb_encode( +#' corpus = "BTMIN", +#' registry = Sys.getenv("CORPUS_REGISTRY"), +#' vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"), +#' data_dir = data_dir, +#' p_attributes = c("word", "pos", "lemma"), +#' s_attributes = list( +#' plenary_protocol = c( +#' "lp", "protocol_no", "date", "year", "birthday", "version", +#' "url", "filetype" +#' ), +#' speaker = c( +#' "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id", +#' "ai_type", "who", "name", "parliamentary_group", "party", "role" +#' ), +#' p = character() +#' ) +#' ) +#' +#' unlink(data_dir) +#' unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin")) +cwb_encode <- function( + corpus, + registry = Sys.getenv("CORPUS_REGISTRY"), + data_dir, + vrt_dir, + encoding = "utf8", + p_attributes = c("word", "pos", "lemma"), + s_attributes = list(), + skip_blank_lines = TRUE, + strip_whitespace = TRUE, + xml = TRUE, + quietly = FALSE, + verbose = FALSE +){ + + if (encoding == "UTF-8") encoding <- "utf8" + if (!encoding %in% cwb_charsets()) stop( + sprintf( + "encoding '%' is not a valid CWB character set, see cwb_charsets() for options", + cwb_charsets + ) + ) + + stopifnot( + is.character(corpus), length(corpus) == 1L, + is.character(registry), length(registry) == 1L, dir.exists(registry), + is.character(data_dir), length(data_dir) == 1L, + dir.exists(data_dir), length(list.files(data_dir)) == 0L, + is.character(vrt_dir), length(vrt_dir) == 1L, dir.exists(vrt_dir), + is.character(encoding), length(encoding) == 1L, + is.character(p_attributes), + is.logical(skip_blank_lines), length(skip_blank_lines) == 1L, + is.logical(strip_whitespace), length(strip_whitespace) == 1L, + is.logical(xml), length(xml) == 1L + ) + + if (length(list.files(vrt_dir)) < 1L) + stop("No files in directory 'vrt_dir'") + + s_attributes_noanno <- 
unlist(lapply( + names(s_attributes), + function(s_attr) + if (length(s_attributes[[s_attr]]) == 0L) s_attr else character() + )) + # zero-length scenario results in NULL + if (is.null(s_attributes_noanno)) s_attributes_noanno <- character() + + for (s_attr in s_attributes_noanno) s_attributes[[s_attr]] <- NULL + + s_attributes_anno <- unname( + unlist( + lapply( + names(s_attributes), + function(s_attr) + paste( + s_attr, ":", 0L, "+", + paste(s_attributes[[s_attr]], collapse = "+"), + sep = "" + ) + ) + ) + ) + if (is.null(s_attributes_anno)) s_attributes_anno <- character() + + # Ensure that paths are standardized + regfile <- as.character(fs::path(fs::path_expand(registry), tolower(corpus))) + data_dir <- as.character(fs::path(fs::path_expand(data_dir))) + vrt_dir <- as.character(fs::path(fs::path_expand(vrt_dir))) + + .cwb_encode( + regfile = regfile, + data_dir = data_dir, + vrt_dir = vrt_dir, + encoding = encoding, + p_attributes = p_attributes, + s_attributes_anno = s_attributes_anno, + s_attributes_noanno = s_attributes_noanno, + skip_blank_lines = skip_blank_lines, + xml = xml, + strip_whitespace = strip_whitespace, + quiet = quietly, + verbosity = verbose + ) +} + + #' @examples #' # The package includes and 'unfinished' corpus of debates in the UN General #' # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it @@ -244,139 +379,6 @@ cwb_compress_rdx <- function( success } -#' @param p_attributes Positional attributes (p-attributes) to be declared. -#' @param data_dir The data directory where `cwb_encode` will save the binary -#' files of the indexed corpus. Tilde expansion is performed on `data_dir` -#' using `path.expand()` to avoid a crash. -#' @param vrt_dir Directory with input corpus files (verticalised format / file -#' ending *.vrt). Tilde expansion is performed on `vrt_dir` using -#' `path.expand()` to avoid a crash. -#' @param encoding The encoding of the files to be encoded. 
Needs to be an -#' encoding supported by CWB, see `cwb_charsets()`. "UTF-8" is taken as -#' "utf8". Defaults to "utf8" (recommended charset). -#' @param s_attributes A `list` of named `character` vectors to declare -#' structural attributes that shall be encoded. The names of the list are the -#' XML elements present in the corpus. Character vectors making up the list -#' declare the attributes that include the metadata of regions. To declare a -#' structural attribute without annotations, provide a zero-length character -#' vector using `character()` - see examples. -#' @param skip_blank_lines A `logical` value, whether to skip blank lines in the -#' input. -#' @param strip_whitespace A `logical` value, whether to strip whitespace from -#' tokens -#' @param xml A `logical` value, whether input is XML. -#' @rdname cwb_utils -#' @export cwb_encode -#' @importFrom fs path -#' @examples -#' data_dir <- file.path(tempdir(), "bt_data_dir") -#' dir.create(data_dir) -#' -#' cwb_encode( -#' corpus = "BTMIN", -#' registry = Sys.getenv("CORPUS_REGISTRY"), -#' vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"), -#' data_dir = data_dir, -#' p_attributes = c("word", "pos", "lemma"), -#' s_attributes = list( -#' plenary_protocol = c( -#' "lp", "protocol_no", "date", "year", "birthday", "version", -#' "url", "filetype" -#' ), -#' speaker = c( -#' "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id", -#' "ai_type", "who", "name", "parliamentary_group", "party", "role" -#' ), -#' p = character() -#' ) -#' ) -#' -#' unlink(data_dir) -#' unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin")) -cwb_encode <- function( - corpus, - registry = Sys.getenv("CORPUS_REGISTRY"), - data_dir, - vrt_dir, - encoding = "utf8", - p_attributes = c("word", "pos", "lemma"), - s_attributes = list(), - skip_blank_lines = TRUE, - strip_whitespace = TRUE, - xml = TRUE, - quietly = FALSE, - verbose = FALSE -){ - - if (encoding == "UTF-8") encoding <- "utf8" - if (!encoding %in% 
cwb_charsets()) stop( - sprintf( - "encoding '%' is not a valid CWB character set, see cwb_charsets() for options", - cwb_charsets - ) - ) - - stopifnot( - is.character(corpus), length(corpus) == 1L, - is.character(registry), length(registry) == 1L, dir.exists(registry), - is.character(data_dir), length(data_dir) == 1L, - dir.exists(data_dir), length(list.files(data_dir)) == 0L, - is.character(vrt_dir), length(vrt_dir) == 1L, dir.exists(vrt_dir), - is.character(encoding), length(encoding) == 1L, - is.character(p_attributes), - is.logical(skip_blank_lines), length(skip_blank_lines) == 1L, - is.logical(strip_whitespace), length(strip_whitespace) == 1L, - is.logical(xml), length(xml) == 1L - ) - - if (length(list.files(vrt_dir)) < 1L) - stop("No files in directory 'vrt_dir'") - - s_attributes_noanno <- unlist(lapply( - names(s_attributes), - function(s_attr) - if (length(s_attributes[[s_attr]]) == 0L) s_attr else character() - )) - # zero-length scenario results in NULL - if (is.null(s_attributes_noanno)) s_attributes_noanno <- character() - - for (s_attr in s_attributes_noanno) s_attributes[[s_attr]] <- NULL - - s_attributes_anno <- unname( - unlist( - lapply( - names(s_attributes), - function(s_attr) - paste( - s_attr, ":", 0L, "+", - paste(s_attributes[[s_attr]], collapse = "+"), - sep = "" - ) - ) - ) - ) - if (is.null(s_attributes_anno)) s_attributes_anno <- character() - - # Ensure that paths are standardized - regfile <- as.character(fs::path(fs::path_expand(registry), tolower(corpus))) - data_dir <- as.character(fs::path(fs::path_expand(data_dir))) - vrt_dir <- as.character(fs::path(fs::path_expand(vrt_dir))) - - .cwb_encode( - regfile = regfile, - data_dir = data_dir, - vrt_dir = vrt_dir, - encoding = encoding, - p_attributes = p_attributes, - s_attributes_anno = s_attributes_anno, - s_attributes_noanno = s_attributes_noanno, - skip_blank_lines = skip_blank_lines, - xml = xml, - strip_whitespace = strip_whitespace, - quiet = quietly, - verbosity = verbose - ) 
-} #' Get CWB version #' diff --git a/man/cwb_charsets.Rd b/man/cwb_charsets.Rd index d1f2947..d26646b 100644 --- a/man/cwb_charsets.Rd +++ b/man/cwb_charsets.Rd @@ -12,9 +12,9 @@ supported by the Corpus Workbench (CWB). The vector is derived from the the \code{CorpusCharset} object defined in the header file of the corpus library (CL). } \details{ -Early versions of the CWB were developed for "latin1", "utf8" support has been -introduced with CWB v3.2. Note that RcppCWB is tested only for "latin1" and -"utf8" and that R uses "UTF-8" rather than utf8" (CWB) by convention. +Early versions of the CWB were developed for "latin1", "utf8" support has +been introduced with CWB v3.2. Note that RcppCWB is tested only for "latin1" +and "utf8" and that R uses "UTF-8" rather than utf8" (CWB) by convention. } \examples{ cwb_charsets() diff --git a/man/cwb_utils.Rd b/man/cwb_utils.Rd index 14313a1..856ecbe 100644 --- a/man/cwb_utils.Rd +++ b/man/cwb_utils.Rd @@ -1,12 +1,27 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/cwb.R -\name{cwb_makeall} +\name{cwb_encode} +\alias{cwb_encode} \alias{cwb_makeall} \alias{cwb_huffcode} \alias{cwb_compress_rdx} -\alias{cwb_encode} \title{CWB Tools for Creating Corpora} \usage{ +cwb_encode( + corpus, + registry = Sys.getenv("CORPUS_REGISTRY"), + data_dir, + vrt_dir, + encoding = "utf8", + p_attributes = c("word", "pos", "lemma"), + s_attributes = list(), + skip_blank_lines = TRUE, + strip_whitespace = TRUE, + xml = TRUE, + quietly = FALSE, + verbose = FALSE +) + cwb_makeall( corpus, p_attribute, @@ -29,36 +44,13 @@ cwb_compress_rdx( quietly = FALSE, delete = TRUE ) - -cwb_encode( - corpus, - registry = Sys.getenv("CORPUS_REGISTRY"), - data_dir, - vrt_dir, - encoding = "utf8", - p_attributes = c("word", "pos", "lemma"), - s_attributes, - skip_blank_lines = TRUE, - strip_whitespace = TRUE, - xml = TRUE, - quietly = FALSE, - verbose = FALSE -) } \arguments{ \item{corpus}{Name of a CWB corpus (upper case).} 
-\item{p_attribute}{Name of p-attribute.} - \item{registry}{Path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY.} -\item{quietly}{A \code{logical} value, whether to turn off messages (including -warnings).} - -\item{delete}{A \code{logical} value, whether to remove redundant file -(p_attribute).corpus after compression.} - \item{data_dir}{The data directory where \code{cwb_encode} will save the binary files of the indexed corpus. Tilde expansion is performed on \code{data_dir} using \code{path.expand()} to avoid a crash.} @@ -88,8 +80,16 @@ tokens} \item{xml}{A \code{logical} value, whether input is XML.} +\item{quietly}{A \code{logical} value, whether to turn off messages (including +warnings).} + \item{verbose}{A \code{logical} value, whether to show progress information (counter of tokens processed).} + +\item{p_attribute}{Name of p-attribute.} + +\item{delete}{A \code{logical} value, whether to remove redundant file +(p_attribute).corpus after compression.} } \description{ Wrappers for the CWB tools \code{cwb-makeall}, \code{cwb-huffcode} and @@ -98,6 +98,30 @@ will always perform a specific indexing/compression step on one positional attribute, and produce all components. 
} \examples{ +data_dir <- file.path(tempdir(), "bt_data_dir") +dir.create(data_dir) + +cwb_encode( + corpus = "BTMIN", + registry = Sys.getenv("CORPUS_REGISTRY"), + vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"), + data_dir = data_dir, + p_attributes = c("word", "pos", "lemma"), + s_attributes = list( + plenary_protocol = c( + "lp", "protocol_no", "date", "year", "birthday", "version", + "url", "filetype" + ), + speaker = c( + "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id", + "ai_type", "who", "name", "parliamentary_group", "party", "role" + ), + p = character() + ) +) + +unlink(data_dir) +unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin")) # The package includes and 'unfinished' corpus of debates in the UN General # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it # is not compressed. @@ -130,7 +154,11 @@ for (x in list.files(home_dir, full.names = TRUE)){ } # perform cwb_makeall (equivalent to cwb-makeall command line utility) -cwb_makeall(corpus = "UNGA2", p_attribute = "word", registry = get_tmp_registry()) +cwb_makeall( + corpus = "UNGA2", + p_attribute = "word", + registry = get_tmp_registry() +) cl_load_corpus("UNGA2", registry = get_tmp_registry()) cqp_load_corpus("UNGA2", registry = get_tmp_registry()) @@ -143,7 +171,11 @@ tokens_sentence_1 <- cl_id2str( corpus = "UNGA2", p_attribute = "word", registry = get_tmp_registry(), id = ids_sentence_1 ) -sentence <- gsub("\\\\s+([\\\\.,])", "\\\\1", paste(tokens_sentence_1, collapse = " ")) +sentence <- gsub( + "\\\\s+([\\\\.,])", + "\\\\1", + paste(tokens_sentence_1, collapse = " ") +) # perform cwb_huffcode (equivalent to cwb-makeall command line utility) cwb_huffcode( @@ -156,28 +188,4 @@ cwb_compress_rdx( p_attribute = "word", registry = get_tmp_registry() ) -data_dir <- file.path(tempdir(), "bt_data_dir") -dir.create(data_dir) - -cwb_encode( - corpus = "BTMIN", - registry = Sys.getenv("CORPUS_REGISTRY"), - vrt_dir = system.file(package = 
"RcppCWB", "extdata", "vrt"), - data_dir = data_dir, - p_attributes = c("word", "pos", "lemma"), - s_attributes = list( - plenary_protocol = c( - "lp", "protocol_no", "date", "year", "birthday", "version", - "url", "filetype" - ), - speaker = c( - "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id", - "ai_type", "who", "name", "parliamentary_group", "party", "role" - ), - p = character() - ) -) - -unlink(data_dir) -unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin")) } From 8b9a373aca67eb487645243369e21f144de08bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 10:14:01 +0100 Subject: [PATCH 42/90] cwb _makeall() does not reset CORPUS_REGISTRY envvar implicitly #92 --- DESCRIPTION | 4 ++-- NEWS.md | 7 ++++++- R/cwb.R | 8 +++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 38f863f..4931d69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.3.9002 -Date: 2024-02-19 +Version: 0.6.3.9003 +Date: 2024-02-29 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index 3007dee..62ce21b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,12 @@ -# RcppCWB 0.6.3.9001 +# RcppCWB 0.6.3.9001-9003 * `cwb_huffcode()` and `cwb_compress_rdx()` did not delete redundant files on Windows. Fixed by temporarily unloading the corpus #89. +* `cwb_encode()` failed if argument `s_attributes` was empty list. Fixed, the +default value of `s_attributes` is now `list()` #90. +* `cwb_makeall()` will not reset CORPUS_REGISTRY environment variable implicitly +if corpus to process has already been loaded #92. 
+ # RcppCWB 0.6.3 diff --git a/R/cwb.R b/R/cwb.R index 96012d0..b936ad1 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -235,12 +235,10 @@ cwb_makeall <- function( # The registry directory provided is ignored if the corpus has already been # loaded, resulting in unexpected behavior. Therefore, we unload the corpus - # and force reloading corpora. - if (toupper(corpus) %in% cqp_list_corpora()){ + # and trigger reloading corpora. + if (tolower(corpus) %in% cl_list_corpora()) cl_delete_corpus(corpus, registry = registry) - cqp_reset_registry(registry = registry) - } - + makeall <- function() .cwb_makeall(x = corpus, p_attribute = p_attribute, registry_dir = registry) From 2fa5cb2d1e54ca930eaf7e72f5c47c68b6a1143b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 10:29:08 +0100 Subject: [PATCH 43/90] added aarch64 architecture to config file #91 --- configure | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/configure b/configure index da6d8bb..57a165e 100755 --- a/configure +++ b/configure @@ -55,14 +55,18 @@ case $OS in echo "* Linux distribution ID: $DISTRO ($SUPPORT)"; # For Debian, Fedora and Ubuntu, the generic Linux configuration works, - # there are no known required adaptions so far. If another distribution is used, - # the previous message informs users that adaptions may be necessary. + # there are no known required adaptions so far. If another distribution is + # used, the previous message informs users that adaptions may be necessary. 
+ # ARCH=`lscpu | head -n 1 | grep -oP '\w+$'` case "$ARCH" in x86_64) CWB_PLATFORM_CONFIG_FILE="linux-64";; i386) CWB_PLATFORM_CONFIG_FILE="linux";; + aarch64) + echo "* architecture ($ARCH), using Linux config as default" + CWB_PLATFORM_CONFIG_FILE="linux";; *) echo "* unknown architecture ($ARCH), using Linux config as default" CWB_PLATFORM_CONFIG_FILE="linux" From 549898421ce99495ec08c46137d11c562676a30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 10:45:21 +0100 Subject: [PATCH 44/90] address aarch64/arm for Linux #91 --- configure | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 57a165e..2b08ec7 100755 --- a/configure +++ b/configure @@ -65,7 +65,9 @@ case $OS in i386) CWB_PLATFORM_CONFIG_FILE="linux";; aarch64) - echo "* architecture ($ARCH), using Linux config as default" + # aarch64/arm64 (= Apple Silicon): E.g. when you run docker container + # on MacBook + echo "* architecture $ARCH - using default Linux config" CWB_PLATFORM_CONFIG_FILE="linux";; *) echo "* unknown architecture ($ARCH), using Linux config as default" From bfa3cfbff2e2c2d084b1408468b1369abfc7c00b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 10:49:59 +0100 Subject: [PATCH 45/90] aarch64 as known config on Apple Silicon #91 --- configure | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 2b08ec7..8f8261a 100755 --- a/configure +++ b/configure @@ -61,16 +61,18 @@ case $OS in ARCH=`lscpu | head -n 1 | grep -oP '\w+$'` case "$ARCH" in x86_64) + echo "* architecture: $ARCH ... using config 'linux-64'" CWB_PLATFORM_CONFIG_FILE="linux-64";; i386) + echo "* architecture: $ARCH ... using config 'linux'" CWB_PLATFORM_CONFIG_FILE="linux";; aarch64) # aarch64/arm64 (= Apple Silicon): E.g. when you run docker container # on MacBook - echo "* architecture $ARCH - using default Linux config" + echo "* architecture: $ARCH ... 
using config 'linux'" CWB_PLATFORM_CONFIG_FILE="linux";; *) - echo "* unknown architecture ($ARCH), using Linux config as default" + echo "* unknown architecture ($ARCH) ... using default config 'linux'" CWB_PLATFORM_CONFIG_FILE="linux" ;; esac From 9a553b1ffa79fe4be8e2697ec7e69d6a403a2a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 11:05:39 +0100 Subject: [PATCH 46/90] clean up test_cwb_encode.R --- NEWS.md | 2 ++ tests/testthat/test_cwb_encode.R | 40 -------------------------------- 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/NEWS.md b/NEWS.md index 62ce21b..27ecfe6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ Windows. Fixed by temporarily unloading the corpus #89. default value of `s_attributes` is now `list()` #90. * `cwb_makeall()` will not reset CORPUS_REGISTRY environment variable implicitly if corpus to process has already been loaded #92. +* Architecture "aarch64" (equivalent to "arm64" / Apple Silicon) as known +Linux architecture (= scenario when running a Docker container on MacBook) #91. 
# RcppCWB 0.6.3 diff --git a/tests/testthat/test_cwb_encode.R b/tests/testthat/test_cwb_encode.R index 3fb4d0b..27ff910 100644 --- a/tests/testthat/test_cwb_encode.R +++ b/tests/testthat/test_cwb_encode.R @@ -2,46 +2,6 @@ library(RcppCWB) use_tmp_registry() testthat::context("cwb_encode") -# utils_zipfile <- tempfile() -# utils_dir <- file.path(tempdir(), "cwb_win-0.0.1", "utils") -# download.file("https://github.com/PolMine/cwb_win/archive/refs/tags/v0.0.1.zip", destfile = utils_zipfile) -# unzip(utils_zipfile, exdir = tempdir()) -# file.rename( -# from = file.path(utils_dir, "libintl-9.dll"), -# to = file.path(utils_dir, "libintl-8.dll") -# ) -# utils <- Sys.glob(sprintf("%s/*.exe", utils_dir)) -# names(utils) <- gsub("\\.exe$", "", basename(utils)) -# -# args <- sprintf( -# '%s -d "%s" -c utf8 -xsB -v -D -F "%s" -R "%s" -P pos -S plenary_protocol:0+lp+protocol_no+date+year+birthday+version+url+filetype -S speaker:0+id+type+lp+protocol_no+date+year+ai_no+ai_id+ai_type+who+name+parliamentary_group+party+role -S p:0+', -# fs::path(utils[["cwb-encode"]]), -# fs::path(tmp_data_dir), -# system.file(package = "RcppCWB", "extdata", "vrt"), -# fs::path(tmp_registry_file) -# ) -# foo <- system(shQuote(args)) -# -# for (p_attr in c("word", "pos", "lemma")){ -# cmd_makeall <- sprintf( -# "%s -r %s -P word BT", -# fs::path(utils[["cwb-makeall"]]), fs::path(tmp_registry) -# ) -# system(command = cmd_makeall) -# -# cmd_huffcode <- sprintf( -# "%s -r %s -P word BT", -# fs::path(utils[["cwb-huffcode"]]), fs::path(tmp_registry) -# ) -# system(command = cmd_huffcode) -# -# cmd_compress_rdx <- sprintf( -# "%s -r %s -P word BT", -# fs::path(utils[["cwb-compress-rdx"]]), fs::path(tmp_registry) -# ) -# system(command = cmd_compress_rdx) -# } - test_that( "identity of RcppCWB and CWB encoding result", From d6e23a17e282844fd9bee309d81d403050418934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 13:50:39 +0100 Subject: [PATCH 47/90] skip cwb_encode 
and cwb_compress_rdx() on Windows in test_encode.R #11 --- tests/testthat/test_cwb_encode.R | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test_cwb_encode.R b/tests/testthat/test_cwb_encode.R index 27ff910..5258163 100644 --- a/tests/testthat/test_cwb_encode.R +++ b/tests/testthat/test_cwb_encode.R @@ -33,33 +33,45 @@ test_that( ) for (p_attr in c("word", "pos", "lemma")){ - cwb_makeall(corpus = "BT", p_attribute = p_attr, registry = get_tmp_registry()) - cwb_huffcode(corpus = "BT", p_attribute = p_attr, registry = get_tmp_registry()) - cwb_compress_rdx(corpus = "BT", p_attribute = p_attr, registry = get_tmp_registry()) + cwb_makeall(corpus = "BT", p_attribute = p_attr, registry = regdir) + if (.Platform$OS.type != "windows"){ + cwb_huffcode(corpus = "BT", p_attribute = p_attr, registry = regdir) + cwb_compress_rdx(corpus = "BT", p_attribute = p_attr, registry = regdir) + } } - expect_true(cl_load_corpus(corpus = "BT", registry = get_tmp_registry())) + expect_true(cl_load_corpus(corpus = "BT", registry = regdir)) expect_true(tolower("BT") %in% cl_list_corpora()) # In the CQP context, corpus IDs are uppered - here we knowingly provide # a lowercase ID that is uppered internally #64 - expect_true(cqp_load_corpus(corpus = "bt", registry = get_tmp_registry())) + expect_true(cqp_load_corpus(corpus = "bt", registry = regdir)) expect_true("BT" %in% cqp_list_corpora()) for (p_attr in c("word", "pos", "lemma")){ expect_equal( - cl_attribute_size(corpus = "BT", attribute = p_attr, attribute_type = "p", registry = get_tmp_registry()), + cl_attribute_size( + corpus = "BT", + attribute = p_attr, + attribute_type = "p", + registry = regdir + ), 8402 ) } - n <- cl_attribute_size(corpus = "BT", attribute = "word", attribute_type = "p", registry = get_tmp_registry()) + n <- cl_attribute_size( + corpus = "BT", + attribute = "word", + attribute_type = "p", + registry = regdir + ) ids <- cl_cpos2id( - "BT", p_attribute 
= "word", registry = regdir, cpos = 0L:(n-1L) ) - words <- cl_id2str("BT", p_attribute = "word", registry = get_tmp_registry(), id = ids) + words <- cl_id2str("BT", p_attribute = "word", registry = regdir, id = ids) expect_equal(table(words == "Liebe")[["TRUE"]], 6) expect_equal(table(words == "SPD")[["TRUE"]], 31) From a1af0a86df11be6a47102ff0e80e342f30852268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 14:03:00 +0100 Subject: [PATCH 48/90] argument logfile for cwb_* functions #65 --- NEWS.md | 3 +++ R/cwb.R | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 27ecfe6..0ed5546 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,9 @@ default value of `s_attributes` is now `list()` #90. if corpus to process has already been loaded #92. * Architecture "aarch64" (equivalent to "arm64" / Apple Silicon) as known Linux architecture (= scenario when running a Docker container on MacBook) #91. +* Functions `cwb_makeall()`, `cwb_huffcode()` and `cwb_compress_rdx()` have +new argument `logfile` to redirect output to this file. Requires argument +`quietly` to be `TRUE` #65. # RcppCWB 0.6.3 diff --git a/R/cwb.R b/R/cwb.R index b936ad1..65fafae 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -32,6 +32,8 @@ #' environment variable CORPUS_REGISTRY. #' @param quietly A `logical` value, whether to turn off messages (including #' warnings). +#' @param logfile Redirect messages of `cwb_makeall()`, `cwb_huffcode()` or +#' `cwb_compress_rdx()` to this file. Requires that quietly is `TRUE`. #' @param verbose A `logical` value, whether to show progress information #' (counter of tokens processed). 
#' @rdname cwb_utils @@ -218,7 +220,8 @@ cwb_makeall <- function( corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), - quietly = FALSE + quietly = FALSE, + logfile ){ registry <- path_expand(path(registry)) @@ -243,7 +246,8 @@ cwb_makeall <- function( .cwb_makeall(x = corpus, p_attribute = p_attribute, registry_dir = registry) if (quietly){ - capture.output({success <- makeall()}, type = "output") + log <- capture.output({success <- makeall()}, type = "output") + if (!missing(logfile)) writeLines(log, con = logfile) } else { success <- makeall() } @@ -260,6 +264,7 @@ cwb_huffcode <- function( p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, + logfile, delete = TRUE ){ @@ -282,7 +287,8 @@ cwb_huffcode <- function( } if (quietly){ - capture.output({success <- huffcode()}, type = "output") + log <- capture.output({success <- huffcode()}, type = "output") + if (!missing(logfile)) writeLines(log, con = logfile) } else { success <- huffcode() } @@ -320,6 +326,7 @@ cwb_compress_rdx <- function( p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, + logfile, delete = TRUE ){ @@ -344,7 +351,8 @@ cwb_compress_rdx <- function( ) if (quietly){ - capture.output({success <- compress_rdx()}, type = "output") + log <- capture.output({success <- compress_rdx()}, type = "output") + if (!missing(logfile)) writeLines(log, con = logfile) } else { success <- compress_rdx() } From b322d6b8b2ee781943b804ec153a4ef28f7ecceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 14:04:19 +0100 Subject: [PATCH 49/90] documentation argument for arg logfile #65 --- man/cwb_utils.Rd | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/man/cwb_utils.Rd b/man/cwb_utils.Rd index 856ecbe..c50a93a 100644 --- a/man/cwb_utils.Rd +++ b/man/cwb_utils.Rd @@ -26,7 +26,8 @@ cwb_makeall( corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), - quietly = FALSE + quietly = FALSE, + logfile ) 
cwb_huffcode( @@ -34,6 +35,7 @@ cwb_huffcode( p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, + logfile, delete = TRUE ) @@ -42,6 +44,7 @@ cwb_compress_rdx( p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"), quietly = FALSE, + logfile, delete = TRUE ) } @@ -88,6 +91,9 @@ warnings).} \item{p_attribute}{Name of p-attribute.} +\item{logfile}{Redirect messages of \code{cwb_makeall()}, \code{cwb_huffcode()} or +\code{cwb_compress_rdx()} to this file. Requires that quietly is \code{TRUE}.} + \item{delete}{A \code{logical} value, whether to remove redundant file (p_attribute).corpus after compression.} } From c578fe8993da5a079d51297c4542dffcb9cbfb12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 14:16:57 +0100 Subject: [PATCH 50/90] documentation/warning that cwb_huffcode() and cwb_compress_rdx() unstable on Windows #11 --- R/cwb.R | 19 +++++++++++++++++++ man/cwb_utils.Rd | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/R/cwb.R b/R/cwb.R index 65fafae..10a8dc5 100644 --- a/R/cwb.R +++ b/R/cwb.R @@ -5,6 +5,13 @@ #' will always perform a specific indexing/compression step on one positional #' attribute, and produce all components. #' +#' @details +#' Running `cwb_huffcode()` and `cwb_compress_rdx()` is optional. Corpora can be +#' fully used without compression. It is recommended when reducing the size of +#' corpus data has relevant benefits, e.g. for sharing data. On Windows, +#' compression is not stable and not recommended. A respective warning +#' is issued when running `cwb_huffcode()` and `cwb_compress_rdx()` on Windows. +#' #' @param p_attributes Positional attributes (p-attributes) to be declared. #' @param data_dir The data directory where `cwb_encode` will save the binary #' files of the indexed corpus. 
Tilde expansion is performed on `data_dir` @@ -268,6 +275,12 @@ cwb_huffcode <- function( delete = TRUE ){ + if (.Platform$OS.type == "windows") + message( + "`cwb_huffcode()` is not stable on Windows. ", + "Corpus compression is optional and not recommended on Windows." + ) + registry <- path_expand(path(registry)) check_registry(registry) regfile <- path(registry, tolower(corpus)) @@ -330,6 +343,12 @@ cwb_compress_rdx <- function( delete = TRUE ){ + if (.Platform$OS.type == "windows") + message( + "`cwb_compress_rdx()` is not stable on Windows. ", + "Corpus compression is optional and not recommended on Windows." + ) + registry <- path_expand(path(registry)) check_registry(registry) regfile <- path(registry, tolower(corpus)) diff --git a/man/cwb_utils.Rd b/man/cwb_utils.Rd index c50a93a..285dfa0 100644 --- a/man/cwb_utils.Rd +++ b/man/cwb_utils.Rd @@ -103,6 +103,13 @@ Wrappers for the CWB tools \code{cwb-makeall}, \code{cwb-huffcode} and will always perform a specific indexing/compression step on one positional attribute, and produce all components. } +\details{ +Running \code{cwb_huffcode()} and \code{cwb_compress_rdx()} is optional. Corpora can be +fully used without compression. It is recommended when reducing the size of +corpus data has relevant benefits, e.g. for sharing data. On Windows, +compression is not stable and not recommended. A respective warning +is issued when running \code{cwb_huffcode()} and \code{cwb_compress_rdx()} on Windows. 
+} \examples{ data_dir <- file.path(tempdir(), "bt_data_dir") dir.create(data_dir) From 38551b2dfe3dc918758b127e1fee82ef7bd6e8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 29 Feb 2024 15:16:39 +0100 Subject: [PATCH 51/90] release v0.6.4 --- DESCRIPTION | 2 +- cran-comments.md | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4931d69..71f8d94 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.3.9003 +Version: 0.6.4 Date: 2024-02-29 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], diff --git a/cran-comments.md b/cran-comments.md index a9add27..01b4d72 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,6 @@ ## General remarks -Fixes -Wformat-security issue under r-devel caused by Rcpp. +A set of minor bug fixes and small enhancements to improve usability. Previous aspects I repeat: @@ -17,9 +17,8 @@ change. 
## Test environments * CI checks with GitHub Actions (Windows/macOS/Ubuntu) -* R winbuilder (R 4.3.0 release, devel, oldrel) +* R winbuilder (R 4.3.2 release, devel, oldrel) * local macOS, R 4.3.1 (arm64) -* Debian with R-devel and clang (14.0.6 and 15.0.6) ## R CMD check results From 22a952eab5fb1722d9aa7e2bbfe7e8548cb3f2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 23 Sep 2024 01:03:30 +0200 Subject: [PATCH 52/90] fix size larger than warning #93 --- src/cwb/cqp/parser.tab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cwb/cqp/parser.tab.c b/src/cwb/cqp/parser.tab.c index cbf80d4..606029e 100644 --- a/src/cwb/cqp/parser.tab.c +++ b/src/cwb/cqp/parser.tab.c @@ -641,7 +641,7 @@ YYID (i) # define YYSTACK_ALLOC YYMALLOC # define YYSTACK_FREE YYFREE # ifndef YYSTACK_ALLOC_MAXIMUM -# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM 4032 /* avoids -Walloc-size-larger-than= error */ # endif # if (defined __cplusplus && ! defined _STDLIB_H \ && ! ((defined YYMALLOC || defined malloc) \ From 547c8a1dee2968625ef3015bf4d606aa767de071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 23 Sep 2024 20:12:16 +0200 Subject: [PATCH 53/90] release v0.6.5 --- DESCRIPTION | 4 ++-- NEWS.md | 7 ++++++- cran-comments.md | 13 +++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 71f8d94..fd6c8df 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.4 -Date: 2024-02-29 +Version: 0.6.5 +Date: 2024-09-23 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/NEWS.md b/NEWS.md index 0ed5546..39101c8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,9 @@ -# RcppCWB 0.6.3.9001-9003 +# RcppCWB 0.6.5 + +* Fixes a 'exceeds maximum object size'-compiler warning #93. 
+ + +# RcppCWB 0.6.4 * `cwb_huffcode()` and `cwb_compress_rdx()` did not delete redundant files on Windows. Fixed by temporarily unloading the corpus #89. diff --git a/cran-comments.md b/cran-comments.md index 01b4d72..6fc4a80 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,8 @@ ## General remarks -A set of minor bug fixes and small enhancements to improve usability. +This release fixes a compiler warning 'argument 1 value '18446744073709551615' exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]'. I have been notified that the fix is expected by September 28. + +The (significant) compiler warning is newly thrown by GCC 14. I have used a Docker image with Fedora 40, R-devel (r87186) and GCC 14 to reproduce and fix the issue. Previous aspects I repeat: @@ -16,18 +18,17 @@ change. ## Test environments +* Docker image with Fedora 40, R-devel r87186 and GCC 14 * CI checks with GitHub Actions (Windows/macOS/Ubuntu) -* R winbuilder (R 4.3.2 release, devel, oldrel) +* R winbuilder (R 4.3.3, R 4.4.1, R-devel r87186 ucrt) * local macOS, R 4.3.1 (arm64) ## R CMD check results -Check status is OK on all test environments, with one exception. On -Windows-oldrel, I see SSL issues with URLs in README.md: +Check status is OK on all test environments. A warning I have seen but that I cannot reproduce results from this website: -- https://developer.apple.com/xcode/ (self signed certificate in certificate chain) -- https://txm.gitpages.huma-num.fr/textometrie/ (unable to get local issuer certificate) +https://txm.gitpages.huma-num.fr/textometrie/ (unable to get local issuer certificate) I do not see these on the R winbuilder for R release of R devel. My browsers do not show a problem with these certificates either. 
From 8b3642b357d08f285d420bed282222e89a30c8b4 Mon Sep 17 00:00:00 2001 From: ablaette Date: Mon, 23 Sep 2024 20:26:40 +0200 Subject: [PATCH 54/90] Update R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 3c7f823..de0cdfe 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -99,7 +99,7 @@ jobs: - name: Upload Windows binary if: matrix.config.os == 'windows-latest' - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: RcppCWB-Windows-binary path: ./*.zip @@ -112,7 +112,7 @@ jobs: - name: Upload macOS binary if: runner.os == 'macOS' - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: RcppCWB-macOS-binary path: ./*.tgz From b59b4cf39252dee094f82fc1a9920d8a28f84b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 27 Mar 2025 23:05:14 +0100 Subject: [PATCH 55/90] snprintf() replaces sprintf() in C code #95 --- cran-comments.md | 3 +-- src/cwb/CQi/auth.c | 2 +- src/cwb/CQi/cqpserver.c | 2 +- src/cwb/CQi/log.c | 4 ++-- src/cwb/CQi/server.c | 4 ++-- src/cwb/cl/cdaccess.c | 8 ++++---- src/cwb/cl/fileutils.c | 12 ++++++------ src/cwb/cl/registry.tab.c | 14 +++++++------- src/cwb/cl/registry.y | 14 +++++++------- src/cwb/cl/regopt.c | 5 +++-- src/cwb/cl/ui-helpers.c | 2 +- src/cwb/cqp/ascii-print.c | 14 +++++++------- src/cwb/cqp/concordance.c | 4 ++-- src/cwb/cqp/corpmanag.c | 12 ++++++------ src/cwb/cqp/cqp.c | 10 +++++----- src/cwb/cqp/groups.c | 2 +- src/cwb/cqp/llquery.c | 14 +++++++------- src/cwb/cqp/macro.c | 8 ++++---- src/cwb/cqp/output.c | 2 +- src/cwb/cqp/parse_actions.c | 16 ++++++++-------- src/cwb/cqp/print-modes.c | 2 +- src/cwb/cqp/ranges.c | 2 +- src/cwb/cqp/tree.c | 12 ++++++------ src/cwb/utils/cwb-align-encode.c | 4 ++-- src/cwb/utils/cwb-align-show.c | 14 +++++++------- 
src/cwb/utils/cwb-compress-rdx.c | 8 ++++---- src/cwb/utils/cwb-decode.c | 16 ++++++++-------- src/cwb/utils/cwb-encode.c | 22 +++++++++++----------- src/cwb/utils/cwb-huffcode.c | 12 ++++++------ src/cwb/utils/cwb-s-encode.c | 6 +++--- src/cwb/utils/cwb-scan-corpus.c | 2 +- 31 files changed, 126 insertions(+), 126 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index 6fc4a80..e2249a0 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,8 +1,7 @@ ## General remarks -This release fixes a compiler warning 'argument 1 value '18446744073709551615' exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]'. I have been notified that the fix is expected by September 28. +- Usage of `sprintf()` has been replaced by `snprintf()` -The (significant) compiler warning is newly thrown by GCC 14. I have used a Docker image with Fedora 40, R-devel (r87186) and GCC 14 to reproduce and fix the issue. Previous aspects I repeat: diff --git a/src/cwb/CQi/auth.c b/src/cwb/CQi/auth.c index 578b469..1edbd63 100644 --- a/src/cwb/CQi/auth.c +++ b/src/cwb/CQi/auth.c @@ -144,7 +144,7 @@ add_hosts_in_subnet_to_list(char *ipsubnet) int i; for (i = 1; i <= 255; i++) { - sprintf(ipaddr, "%s%d", ipsubnet, i); + snprintf(ipaddr, strlen(ipsubnet) + 4, "%s%d", ipsubnet, i); add_host_to_list(ipaddr); } cl_free(ipaddr); diff --git a/src/cwb/CQi/cqpserver.c b/src/cwb/CQi/cqpserver.c index 6ea7f63..6f201a1 100644 --- a/src/cwb/CQi/cqpserver.c +++ b/src/cwb/CQi/cqpserver.c @@ -871,7 +871,7 @@ do_cqi_cqp_query(void) query_lock = floor(1e9 * cl_random_fraction()) + 1; /* activate query lock mode with random key */ cqiserver_log(Info, "query_lock = %d\n", query_lock); - sprintf(cqp_query, "%s = %s;", child, query); + snprintf(cqp_query, len, "%s = %s;", child, query); if (!cqp_parse_string(cqp_query)) cqi_command(CQI_CQP_ERROR_GENERAL); /* should be changed to detailed error messages */ else { diff --git a/src/cwb/CQi/log.c b/src/cwb/CQi/log.c index 74b745b..946bb5b 
100644 --- a/src/cwb/CQi/log.c +++ b/src/cwb/CQi/log.c @@ -143,10 +143,10 @@ cqiserver_debug_arglist(const char *arg_list, int n_args, int int_args) if (server_debug) { for (i = 0, mark = buf ; i < n_args ; i += incr) if (int_args) - sprintf(mark += strlen(mark), "%d ", (int)arg_list[i]); + snprintf(mark += strlen(mark), CL_MAX_LINE_LENGTH, "%d ", (int)arg_list[i]); else /* super creaky typecasting needed to get the compiler to not complain here */ - sprintf(mark += strlen(mark), "'%s' ", arg_list); + snprintf(mark += strlen(mark), CL_MAX_LINE_LENGTH, "'%s' ", arg_list); } return buf; } diff --git a/src/cwb/CQi/server.c b/src/cwb/CQi/server.c index 0b711fd..c6acb22 100644 --- a/src/cwb/CQi/server.c +++ b/src/cwb/CQi/server.c @@ -210,7 +210,7 @@ accept_connection(int port) if (err != 0) { char buffer[50]; - sprintf(buffer,"ERROR WSAStartup failed with error: %d\n",err); + snprintf(buffer, 50, "ERROR WSAStartup failed with error: %d\n",err); perror(buffer); return -1; } @@ -1070,7 +1070,7 @@ combine_subcorpus_spec(char *corpus, char *subcorpus) if (!subcorpus) return cl_strdup(corpus); spec = (char *) cl_malloc(strlen(corpus) + strlen(subcorpus) + 2); - sprintf(spec, "%s:%s", corpus, subcorpus); + snprintf(spec, strlen(corpus) + strlen(subcorpus) + 2, "%s:%s", corpus, subcorpus); return spec; } diff --git a/src/cwb/cl/cdaccess.c b/src/cwb/cl/cdaccess.c index 47a28db..12ff983 100644 --- a/src/cwb/cl/cdaccess.c +++ b/src/cwb/cl/cdaccess.c @@ -2424,13 +2424,13 @@ cl_dynamic_call(Attribute *attribute, case ATTAT_INT: case ATTAT_POS: - sprintf(istr, "%d", args[ap].value.intres); + snprintf(istr, 32, "%d", args[ap].value.intres); for (k = 0; istr[k]; k++) call[ins++] = istr[k]; break; case ATTAT_FLOAT: - sprintf(istr, "%f", args[ap].value.floatres); + snprintf(istr, 32, "%f", args[ap].value.floatres); for (k = 0; istr[k]; k++) call[ins++] = istr[k]; break; @@ -2453,13 +2453,13 @@ cl_dynamic_call(Attribute *attribute, case ATTAT_INT: case ATTAT_POS: - sprintf(istr, "%d", 
args[ap].value.intres); + snprintf(istr, 32, "%d", args[ap].value.intres); for (k = 0; istr[k]; k++) call[ins++] = istr[k]; break; case ATTAT_FLOAT: - sprintf(istr, "%f", args[ap].value.floatres); + snprintf(istr, 32, "%f", args[ap].value.floatres); for (k = 0; istr[k]; k++) call[ins++] = istr[k]; break; diff --git a/src/cwb/cl/fileutils.c b/src/cwb/cl/fileutils.c index 2ef5b25..990e73c 100644 --- a/src/cwb/cl/fileutils.c +++ b/src/cwb/cl/fileutils.c @@ -326,26 +326,26 @@ cl_open_stream(const char *filename, int mode, int type) case CL_STREAM_GZIP: point = g_shell_quote(filename); if (mode == CL_STREAM_APPEND) { - sprintf(command, "gzip >> %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "gzip >> %s", point); mode_spec = (mode_spec[1] == 'b' ? "wb" : "w"); } else if (mode == CL_STREAM_WRITE) - sprintf(command, "gzip > %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "gzip > %s", point); else - sprintf(command, "gzip -cd %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "gzip -cd %s", point); handle = popen(command, mode_spec); g_free(point); break; case CL_STREAM_BZIP2: point = g_shell_quote(filename); if (mode == CL_STREAM_APPEND) { - sprintf(command, "bzip2 >> %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "bzip2 >> %s", point); mode_spec = (mode_spec[1] == 'b' ? 
"wb" : "w"); } else if (mode == CL_STREAM_WRITE) - sprintf(command, "bzip2 > %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "bzip2 > %s", point); else - sprintf(command, "bzip2 -cd %s", point); + snprintf(command, 2 * CL_MAX_FILENAME_LENGTH, "bzip2 -cd %s", point); handle = popen(command, mode_spec); g_free(point); break; diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 857bcaf..a9beaaf 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -165,7 +165,7 @@ char cregestring[1024]; #define cregSetAttrComponentPath(attr, cid, path) \ { \ if (!declare_component(attr, cid, path)) { \ - sprintf(cregestring, "Component %s with path %s declared twice" \ + snprintf(cregestring, 1024, "Component %s with path %s declared twice" \ " (or internal error)", cid_name(cid), path); \ cl_free(path); \ cregerror(cregestring); \ @@ -1670,7 +1670,7 @@ yyparse () #line 259 "registry.y" { if ((cregattrib = setup_attribute(cregcorpus, (yyvsp[(2) - (2)].strval), ATT_POS, NULL)) == NULL) { - sprintf(cregestring, + snprintf(cregestring, 1024, "Positional attribute %s declared twice -- " "semantic error", (yyvsp[(2) - (2)].strval)); cl_free((yyvsp[(2) - (2)].strval)); @@ -1687,7 +1687,7 @@ yyparse () case 30: #line 271 "registry.y" { if (((yyval.attr) = setup_attribute(cregcorpus, (yyvsp[(2) - (3)].strval), ATT_ALIGN, NULL)) == NULL) { - sprintf(cregestring, "Alignment attribute %s declared twice -- " + snprintf(cregestring, 1024, "Alignment attribute %s declared twice -- " "semantic error", (yyvsp[(2) - (3)].strval)); cl_free((yyvsp[(2) - (3)].strval)); cl_free((yyvsp[(3) - (3)].storage).path); @@ -1701,7 +1701,7 @@ yyparse () case 31: #line 282 "registry.y" { if (((yyval.attr) = setup_attribute(cregcorpus, (yyvsp[(2) - (3)].strval), ATT_STRUC, NULL)) == NULL) { - sprintf(cregestring, "Structure attribute %s declared twice -- " + snprintf(cregestring, 1024, "Structure attribute %s declared twice -- " "semantic error", (yyvsp[(2) - 
(3)].strval)); cl_free((yyvsp[(2) - (3)].strval)); cl_free((yyvsp[(3) - (3)].storage).path); @@ -1718,7 +1718,7 @@ yyparse () DynArg *a; - sprintf(cregestring, "Dynamic attribute %s declared twice -- " + snprintf(cregestring, 1024, "Dynamic attribute %s declared twice -- " "semantic error", (yyvsp[(2) - (8)].strval)); cl_free((yyvsp[(2) - (8)].strval)); cl_free((yyvsp[(7) - (8)].args)); @@ -1792,7 +1792,7 @@ yyparse () #line 352 "registry.y" { (yyval.args) = (DynArg *)makearg((yyvsp[(1) - (1)].strval)); if ((yyval.args) == NULL) { - sprintf(cregestring, "Illegal argument type %s or " + snprintf(cregestring, 1024, "Illegal argument type %s or " "not enough memory -- FATAL ERROR", (yyvsp[(1) - (1)].strval)); cregerror(cregestring); } @@ -1867,7 +1867,7 @@ yyparse () #line 389 "registry.y" { char *nr; nr = (char *)cl_malloc(16); - sprintf(nr, "%d", (yyvsp[(1) - (1)].ival)); + snprintf(nr, 16, "%d", (yyvsp[(1) - (1)].ival)); (yyval.strval) = nr; ;} break; diff --git a/src/cwb/cl/registry.y b/src/cwb/cl/registry.y index cbb25a9..f060473 100644 --- a/src/cwb/cl/registry.y +++ b/src/cwb/cl/registry.y @@ -44,7 +44,7 @@ char cregestring[1024]; #define cregSetAttrComponentPath(attr, cid, path) \ { \ if (!declare_component(attr, cid, path)) { \ - sprintf(cregestring, "Component %s with path %s declared twice" \ + snprintf(cregestring, 1024, "Component %s with path %s declared twice" \ " (or internal error)", cid_name(cid), path); \ cl_free(path); \ cregerror(cregestring); \ @@ -259,7 +259,7 @@ Attributes : Attribute { Attribute : ATTRIBUTE_SYM id { if ((cregattrib = setup_attribute(cregcorpus, $2, ATT_POS, NULL)) == NULL) { - sprintf(cregestring, + snprintf(cregestring, 1024, "Positional attribute %s declared twice -- " "semantic error", $2); cl_free($2); @@ -270,7 +270,7 @@ Attribute : ATTRIBUTE_SYM | ALIGNED_SYM id StorageSpec { if (($$ = setup_attribute(cregcorpus, $2, ATT_ALIGN, NULL)) == NULL) { - sprintf(cregestring, "Alignment attribute %s declared twice -- " + 
snprintf(cregestring, 1024, "Alignment attribute %s declared twice -- " "semantic error", $2); cl_free($2); cl_free($3.path); @@ -281,7 +281,7 @@ Attribute : ATTRIBUTE_SYM } | STRUCTURE_SYM id StorageSpec { if (($$ = setup_attribute(cregcorpus, $2, ATT_STRUC, NULL)) == NULL) { - sprintf(cregestring, "Structure attribute %s declared twice -- " + snprintf(cregestring, 1024, "Structure attribute %s declared twice -- " "semantic error", $2); cl_free($2); cl_free($3.path); @@ -297,7 +297,7 @@ Attribute : ATTRIBUTE_SYM DynArg *a; - sprintf(cregestring, "Dynamic attribute %s declared twice -- " + snprintf(cregestring, 1024, "Dynamic attribute %s declared twice -- " "semantic error", $2); cl_free($2); cl_free($7); @@ -352,7 +352,7 @@ ArgList : SingleArg { $$ = $1; } SingleArg : id { $$ = (DynArg *)makearg($1); if ($$ == NULL) { - sprintf(cregestring, "Illegal argument type %s or " + snprintf(cregestring, 1024, "Illegal argument type %s or " "not enough memory -- FATAL ERROR", $1); cregerror(cregestring); } @@ -389,7 +389,7 @@ path : id { $$ = $1; } id : IDENTIFIER { $$ = $1; } | NUMBER { char *nr; nr = (char *)cl_malloc(16); - sprintf(nr, "%d", $1); + snprintf(nr, 16, "%d", $1); $$ = nr; } ; diff --git a/src/cwb/cl/regopt.c b/src/cwb/cl/regopt.c index 76e629b..9fad870 100644 --- a/src/cwb/cl/regopt.c +++ b/src/cwb/cl/regopt.c @@ -242,8 +242,9 @@ cl_new_regex(char *regex, int flags, CorpusCharset charset) cl_free(delatexed_regex); /* add start and end anchors to improve performance of regex matcher for expressions such as ".*ung" */ - anchored_regex = (char *) cl_malloc(strlen(preprocessed_regex) + 7); - sprintf(anchored_regex, "^(?:%s)$", preprocessed_regex); + int regex_len = strlen(preprocessed_regex) + 7; + anchored_regex = (char *) cl_malloc(regex_len); + snprintf(anchored_regex, regex_len, "^(?:%s)$", preprocessed_regex); length_regex = (PCRE2_SIZE)strlen(anchored_regex); /* compile regular expression with PCRE library function */ diff --git a/src/cwb/cl/ui-helpers.c 
b/src/cwb/cl/ui-helpers.c index b8fce0b..7ed3bec 100644 --- a/src/cwb/cl/ui-helpers.c +++ b/src/cwb/cl/ui-helpers.c @@ -132,7 +132,7 @@ progress_bar_percentage(int pass, int total, int percentage) { /* [pass of : % complete] (uses progress_bar_message) */ char message[20]; - sprintf(message, "%3d%c complete", percentage, '%'); + snprintf(message, 20, "%3d%c complete", percentage, '%'); progress_bar_message(pass, total, message); } diff --git a/src/cwb/cqp/ascii-print.c b/src/cwb/cqp/ascii-print.c index 3093c9c..c47eb04 100644 --- a/src/cwb/cqp/ascii-print.c +++ b/src/cwb/cqp/ascii-print.c @@ -259,7 +259,7 @@ get_screen_escapes(void) /* in highlighted mode, switch off display attributes at end of line (to be on the safe side) */ ASCIIHighlightedPrintDescriptionRecord.AfterLine = cl_malloc(strlen(sc_all_out) + 2); - sprintf(ASCIIHighlightedPrintDescriptionRecord.AfterLine, + snprintf(ASCIIHighlightedPrintDescriptionRecord.AfterLine, strlen(sc_all_out) + 2, "%s\n", sc_all_out); /* print cpos in blue, "print structures" in pink if we're in coloured mode */ @@ -270,13 +270,13 @@ get_screen_escapes(void) char *bold = get_typeface_escape('b'); ASCIIHighlightedPrintDescriptionRecord.CPOSPrintFormat = cl_malloc(strlen(blue) + strlen(normal) + 8); - sprintf(ASCIIHighlightedPrintDescriptionRecord.CPOSPrintFormat, + snprintf(ASCIIHighlightedPrintDescriptionRecord.CPOSPrintFormat, strlen(blue) + strlen(normal) + 8, "%s%c9d:%s ", blue, '%', normal); ASCIIHighlightedPrintDescriptionRecord.BeforePrintStructures = cl_malloc(strlen(pink) + strlen(bold) + 4); - sprintf(ASCIIHighlightedPrintDescriptionRecord.BeforePrintStructures, + snprintf(ASCIIHighlightedPrintDescriptionRecord.BeforePrintStructures, strlen(pink) + strlen(bold) + 4, "%s%s", pink, bold); ASCIIHighlightedPrintDescriptionRecord.AfterPrintStructures = cl_malloc(strlen(normal) + 6); - sprintf(ASCIIHighlightedPrintDescriptionRecord.AfterPrintStructures, + 
snprintf(ASCIIHighlightedPrintDescriptionRecord.AfterPrintStructures, strlen(normal) + 6, ":%s ", normal); } } @@ -384,7 +384,7 @@ ascii_print_field(FieldType field, int at_end) /* if colours are activated & seem to work, print target number in red, otherwise print in parens */ if (*red) { /* must set colour first, then all other current attributes */ - sprintf(sc_before_token + strlen(sc_before_token), + snprintf(sc_before_token + strlen(sc_before_token), 0 + strlen(sc_before_token), "%s%s%s%s%s%d", sc_all_out, red, @@ -394,7 +394,7 @@ ascii_print_field(FieldType field, int at_end) field - TargetField); /* should yield 0 .. 9 */ } else - sprintf(sc_before_token + strlen(sc_before_token), "(%d)", field - TargetField ); /* should yield 0 .. 9 */ + snprintf(sc_before_token + strlen(sc_before_token), strlen(sc_before_token), "(%d)", field - TargetField ); /* should yield 0 .. 9 */ } /* set the display attribute flags */ @@ -427,7 +427,7 @@ ascii_print_field(FieldType field, int at_end) } /* now compose escape sequence which has to be sent to the terminal (setting _all_ attributes to their current values) */ - sprintf(sc_before_token + strlen(sc_before_token), + snprintf(sc_before_token + strlen(sc_before_token), strlen(sc_before_token), "%s%s%s%s", sc_all_out, /* first switch off all attributes, then set the active ones in order standout, underline, bold */ (sc_s_mode) ? 
sc_s_in : "", diff --git a/src/cwb/cqp/concordance.c b/src/cwb/cqp/concordance.c index ecfe7a3..84b5b04 100644 --- a/src/cwb/cqp/concordance.c +++ b/src/cwb/cqp/concordance.c @@ -86,7 +86,7 @@ compose_kwic_print_structures(ContextDescriptor *cd, { if (show_cpos && pdr->CPOSPrintFormat) { static char rendered_cpos[CL_MAX_LINE_LENGTH]; /* another 'Oli': this was num[16], definitely not enough for HTML output */ - sprintf(rendered_cpos, pdr->CPOSPrintFormat, position); + snprintf(rendered_cpos, CL_MAX_LINE_LENGTH, pdr->CPOSPrintFormat, position); cl_autostring_concat(s, rendered_cpos); } @@ -230,7 +230,7 @@ compose_kwic_token(ContextDescriptor *cd, int position, ClAutoString dest, Print static char body[CL_MAX_LINE_LENGTH]; /* 'body' of the start tag, may include annotation */ if (show_tag_attributes && region->annot) - sprintf(body, "%s %s", region->name, region->annot); + snprintf(body, CL_MAX_LINE_LENGTH, "%s %s", region->name, region->annot); else cl_strcpy(body, region->name); diff --git a/src/cwb/cqp/corpmanag.c b/src/cwb/cqp/corpmanag.c index f59e2db..f8a701c 100644 --- a/src/cwb/cqp/corpmanag.c +++ b/src/cwb/cqp/corpmanag.c @@ -281,7 +281,7 @@ ensure_corpus_size(CorpusList *cl) if (cl->mother_name == NULL) strcpy(filename, cl->name); else - sprintf(filename, "%s:%s", cl->mother_name, cl->name); + snprintf(filename, CL_MAX_FILENAME_LENGTH, "%s:%s", cl->mother_name, cl->name); return attach_subcorpus(cl, cl->local_dir, filename); } @@ -927,14 +927,14 @@ get_fulllocalpath(CorpusList *cl, int qualify) upname = cl->mother_name ? cl_strdup(cl->mother_name) : cl_strdup("NONE"); cl_id_toupper(upname); - sprintf(fullname, "%s%s%s:%s", data_directory, + snprintf(fullname, CL_MAX_FILENAME_LENGTH, "%s%s%s:%s", data_directory, data_directory[strlen(data_directory)-1] == SUBDIR_SEPARATOR ? "" : SUBDIR_SEP_STRING, cl->mother_name ? 
cl->mother_name : "NONE", cl->name); cl_free(upname); } else - sprintf(fullname, "%s%s%s", data_directory, + snprintf(fullname, CL_MAX_FILENAME_LENGTH, "%s%s%s", data_directory, data_directory[strlen(data_directory)-1] == SUBDIR_SEPARATOR ? "" : SUBDIR_SEP_STRING, cl->name); @@ -990,7 +990,7 @@ check_stamp(char *directory, char *fname) char full_name[CL_MAX_FILENAME_LENGTH]; int magic, ok; - sprintf(full_name, "%s" SUBDIR_SEP_STRING "%s", directory, fname); + snprintf(full_name, CL_MAX_FILENAME_LENGTH, "%s" SUBDIR_SEP_STRING "%s", directory, fname); if (((fd = cl_open_stream(full_name, "rb")) == NULL) || (fread(&magic, sizeof(int), 1, fd) == 0) || @@ -1450,7 +1450,7 @@ save_subcorpus(CorpusList *cl, char *fname) cqpmessage(Warning, "Directory for private subcorpora isn't set, can't save %s", cl->name); return False; } - sprintf(fname, "%s%c%s:%s", + snprintf(fname, CL_MAX_FILENAME_LENGTH, "%s%c%s:%s", data_directory, SUBDIR_SEPARATOR, cl->mother_name ? cl->mother_name : "NONE", @@ -1837,7 +1837,7 @@ show_corpora_backend(CorpusType ct, int only_active_corpus) if (pretty_print) { if (list[i][0] != initial) { initial = list[i][0]; - sprintf(label, " %c:", initial); + snprintf(label, 4, " %c:", initial); ilist_print_break(label); } ilist_print_item(list[i]); diff --git a/src/cwb/cqp/cqp.c b/src/cwb/cqp/cqp.c index 4f57881..5e53c42 100644 --- a/src/cwb/cqp/cqp.c +++ b/src/cwb/cqp/cqp.c @@ -210,7 +210,7 @@ initialize_cqp(int argc, char **argv) /* under Windows it is %HOMEDRIVE%%HOMEPATH% */ if (NULL != (homepath = (char *)getenv("HOMEPATH")) && NULL != (homedrive = (char *)getenv("HOMEDRIVE"))) { home = (char *)cl_malloc(256); - sprintf(home, "%s%s", homedrive, homepath); + snprintf(home, 256, "%s%s", homedrive, homepath); } #endif /* note that either way above, home is NULL if the needed env var(s) were not found. 
*/ @@ -237,9 +237,9 @@ initialize_cqp(int argc, char **argv) /* read init file specified with -I , otherwise look for $HOME/.cqprc */ if (cqp_init_file) - sprintf(init_file_fullname, "%s", cqp_init_file); + snprintf(init_file_fullname, CL_MAX_FILENAME_LENGTH, "%s", cqp_init_file); else if (home) - sprintf(init_file_fullname, "%s%c%s", home, SUBDIR_SEPARATOR, CQPRC_NAME); + snprintf(init_file_fullname, CL_MAX_FILENAME_LENGTH, "%s%c%s", home, SUBDIR_SEPARATOR, CQPRC_NAME); if (init_file_fullname[0] != '\0') { if (NULL != (cqprc = fopen(init_file_fullname, "r"))) { @@ -274,9 +274,9 @@ initialize_cqp(int argc, char **argv) /* read macro init file specified with -M ; otherwise look for ~/.cqpmacros */ if (macro_init_file) - sprintf(init_file_fullname, "%s", macro_init_file); + snprintf(init_file_fullname, CL_MAX_FILENAME_LENGTH, "%s", macro_init_file); else if (home) - sprintf(init_file_fullname, "%s%c%s", home, SUBDIR_SEPARATOR, CQPMACROS_NAME); + snprintf(init_file_fullname, CL_MAX_FILENAME_LENGTH, "%s%c%s", home, SUBDIR_SEPARATOR, CQPMACROS_NAME); if (init_file_fullname[0] != '\0') { if (NULL != (cqprc = fopen(init_file_fullname, "r"))) { diff --git a/src/cwb/cqp/groups.c b/src/cwb/cqp/groups.c index b547256..a708762 100644 --- a/src/cwb/cqp/groups.c +++ b/src/cwb/cqp/groups.c @@ -355,7 +355,7 @@ ComputeGroupExternally(Group *group) Rprintf("%d %d\n", get_group_id(group, i, 0, NULL), get_group_id(group, i, 1, NULL)); /* (source ID, target ID) */ /* construct sort call */ - sprintf(sort_call, ExternalGroupCommand, temporary_name); + snprintf(sort_call, CL_MAX_LINE_LENGTH, ExternalGroupCommand, temporary_name); #if GROUP_DEBUG Rprintf("Running grouping sort: \n\t%s\n", sort_call); #endif diff --git a/src/cwb/cqp/llquery.c b/src/cwb/cqp/llquery.c index fe531b9..bc6c9ea 100644 --- a/src/cwb/cqp/llquery.c +++ b/src/cwb/cqp/llquery.c @@ -201,7 +201,7 @@ cqp_custom_completion(const char *text, int start, int end) #endif if (strncmp(prefix, var->my_name, prefix_len) == 0) { 
/* found variable matching prefix -> format and add */ completion = cl_malloc(strlen(var->my_name) + 2); - sprintf(completion, "$%s", var->my_name); + snprintf(completion, strlen(var->my_name) + 2, "$%s", var->my_name); cc_compl_list_add(completion); } var = variables_iterator_next(); @@ -308,7 +308,7 @@ cqp_custom_completion(const char *text, int start, int end) if (mother_len) { /* we must allocate a string of sufficient length and build a full subcorpus specifier */ completion = (char *) cl_malloc(mother_len + 1 + strlen(cl->name) + 1); - sprintf(completion, "%s:%s", mother, cl->name); + snprintf(completion, mother_len + 1 + strlen(cl->name) + 1, "%s:%s", mother, cl->name); cc_compl_list_add(completion); } else { @@ -329,7 +329,7 @@ cqp_custom_completion(const char *text, int start, int end) /* requires special handling: return '':'' */ char *completion = (char *) cl_malloc(strlen(cl->mother_name) + 2); /* just show there are subcorpora as well; user must type ':' to see subcorpora completions */ - sprintf(completion, "%s:", cl->mother_name); + snprintf(completion, strlen(cl->mother_name) + 2, "%s:", cl->mother_name); /* note that this will return the same string over and over again if there are multiple subcorpora; fortunately, readline sorts and uniqs the list of completions, so we don't have to worry */ cc_compl_list_add(completion); @@ -405,15 +405,15 @@ readline_main(void) if (current_corpus != NULL) { /* don't use terminal colours for the prompt because they mess up readline's formatting */ if (cl_streq(current_corpus->name, current_corpus->mother_name)) - sprintf(prompt, "%s> ", current_corpus->name); + snprintf(prompt, CL_MAX_LINE_LENGTH, "%s> ", current_corpus->name); else - sprintf(prompt, "%s:%s[%d]> ", + snprintf(prompt, CL_MAX_LINE_LENGTH, "%s:%s[%d]> ", current_corpus->mother_name, current_corpus->name, current_corpus->size); } else - sprintf(prompt, "[no corpus]> "); + snprintf(prompt, CL_MAX_LINE_LENGTH, "[no corpus]> "); input = 
readline(prompt); } @@ -491,7 +491,7 @@ main(int argc, char *argv[]) for (i = 3; i <= 4; i++) { Rprintf("["); for (j = 0; j < 8; j++) { - sprintf(sc_colour, "\x1B[0;%d%dm", i,j); + snprintf(sc_colour, 256, "\x1B[0;%d%dm", i,j); Rprintf("%d%d: %sN%s%sB%s%sU%s%sS%s ", i, j, sc_colour, diff --git a/src/cwb/cqp/macro.c b/src/cwb/cqp/macro.c index bfd47dc..8c0c379 100644 --- a/src/cwb/cqp/macro.c +++ b/src/cwb/cqp/macro.c @@ -522,7 +522,7 @@ expand_macro(const char *name) cl_free(macro_arg[i]); /* set pseudo argument */ - sprintf(pseudo_arg, "_pseudo_%u", pseudo_arg_counter++); + snprintf(pseudo_arg, 20, "_pseudo_%u", pseudo_arg_counter++); /* some rather daring circular usage of the lexical analyzer ... but it _should_ work */ /* the '(' is part of the macro token recognised by the lexical analyzer now */ @@ -1149,7 +1149,7 @@ macro_iterator_next_prototype(const char *prefix) prototype = (char *)cl_malloc(len); /* assemble formatted prototype: /NAME[ARG, ARG, ARG] */ - sprintf(prototype, "/%s[", macro->name); + snprintf(prototype, len, "/%s[", macro->name); for (i = 0; i < macro->args; i++) { strcat(prototype, macro->argnames[i] ? 
macro->argnames[i] : "_"); /* append ',' unless this is the last argument */ @@ -1209,7 +1209,7 @@ list_macros(const char *prefix) if (!prefix || !strncasecmp(p->name, prefix, len)) { l = strlen(p->name) + 8; macro_name = (char *) cl_malloc(l); - sprintf(macro_name, "/%s(%d)", p->name, p->args); + snprintf(macro_name, l, "/%s(%d)", p->name, p->args); list[k++] = macro_name; } @@ -1223,7 +1223,7 @@ list_macros(const char *prefix) if (pretty_print) { if (list[i][1] != initial) { initial = list[i][1]; - sprintf(label, " %c:", initial); + snprintf(label, 4, " %c:", initial); ilist_print_break(label); } ilist_print_item(list[i]); diff --git a/src/cwb/cqp/output.c b/src/cwb/cqp/output.c index 1ff5d8a..99e9219 100644 --- a/src/cwb/cqp/output.c +++ b/src/cwb/cqp/output.c @@ -58,7 +58,7 @@ char emulate_setenv_buffer[CL_MAX_LINE_LENGTH]; /* should be big enough for "var int setenv(const char *name, const char *value, int overwrite) { assert(name != NULL && value != NULL && "Invalid call of setenv() emulation function."); - sprintf(emulate_setenv_buffer, "%s=%s", name, value); + snprintf(emulate_setenv_buffer, CL_MAX_LINE_LENGTH, "%s=%s", name, value); return putenv(emulate_setenv_buffer); } diff --git a/src/cwb/cqp/parse_actions.c b/src/cwb/cqp/parse_actions.c index ed1b409..d046260 100644 --- a/src/cwb/cqp/parse_actions.c +++ b/src/cwb/cqp/parse_actions.c @@ -1406,10 +1406,10 @@ do_XMLTag(char *s_name, int is_closing, int op, char *regex, int flags) pattern = cl_malloc(strlen(conv_regex) + 42); /* leave some room for the regexp wrapper */ if (OP_CONTAINS == op_type) - sprintf(pattern, ".*\\|(%s)\\|.*", conv_regex); + snprintf(pattern, strlen(conv_regex) + 42, ".*\\|(%s)\\|.*", conv_regex); else /* op_type == OP_MATCHES */ /* if inner regexp is 'safe', we can omit the parentheses and thus enable optimisation */ - sprintf(pattern, safe_regex ? "\\|(%s\\|)+" : "\\|((%s)\\|)+", conv_regex); + snprintf(pattern, strlen(conv_regex) + 42, safe_regex ? 
"\\|(%s\\|)+" : "\\|((%s)\\|)+", conv_regex); cl_free(conv_regex); break; @@ -1508,7 +1508,7 @@ Evaltree do_RegionElement(char *name, /* try to find a named query result for the query_corpus */ corpus_name = (query_corpus->type == SUB || query_corpus->type == TEMP) ? query_corpus->mother_name : query_corpus->name; /* type TEMP is a temporary subcorpus, which seems to be used for all subqueries */ nqr_name = cl_malloc(strlen(corpus_name) + strlen(name) + 2); - sprintf(nqr_name, "%s:%s", corpus_name, name); /* construct qualified NQR name for lookup with findcorpus() */ + snprintf(nqr_name, strlen(corpus_name) + strlen(name) + 2, "%s:%s", corpus_name, name); /* construct qualified NQR name for lookup with findcorpus() */ nqr = findcorpus(nqr_name, SUB, 0); /* also ensures that NQR is loaded from disk if necessary */ cl_free(nqr_name); if (!nqr) { @@ -2476,10 +2476,10 @@ do_flagged_re_variable(char *varname, int flags) for (i = 0; i < N_strings; i++) length += strlen(items[i]) + 1; s = cl_malloc(length); - l = sprintf(s, "%s", items[0]); + l = snprintf(s, length, "%s", items[0]); mark = s + l; /* points to the trailing null byte */ for (i = 1; i < N_strings; i++) { - l = sprintf(mark, "|%s", items[i]); + l = snprintf(mark, length - (mark - s), "|%s", items[i]); mark += l; } cl_free(items); @@ -2643,13 +2643,13 @@ do_feature_set_string(char *s, int op, int flags) switch (op & OP_NOT_MASK) { case OP_CONTAINS: - sprintf(pattern, ".*\\|(%s)\\|.*", converted_s); + snprintf(pattern, strlen(converted_s) + 42, ".*\\|(%s)\\|.*", converted_s); break; case OP_MATCHES: if (safe_regexp) /* inner regexp is 'safe' so we can omit the parentheses and thus enable optimisation */ - sprintf(pattern, "\\|(%s\\|)+", converted_s); + snprintf(pattern, strlen(converted_s) + 42, "\\|(%s\\|)+", converted_s); else - sprintf(pattern, "\\|((%s)\\|)+", converted_s); + snprintf(pattern, strlen(converted_s) + 42, "\\|((%s)\\|)+", converted_s); break; default: /* undefined operator */ diff --git 
a/src/cwb/cqp/print-modes.c b/src/cwb/cqp/print-modes.c index 64d0f2f..73002f4 100644 --- a/src/cwb/cqp/print-modes.c +++ b/src/cwb/cqp/print-modes.c @@ -91,7 +91,7 @@ ComputePrintStructures(CorpusList *cl) for ( ai = al ? al->list : NULL ; ai ; ai = ai->next ) { if (p != printStructure) *p++ = ' '; /* insert blank between attributes */ - sprintf(p, "%s", ai->attribute->any.name); + snprintf(p, strlen(p), "%s", ai->attribute->any.name); p += strlen(p); } diff --git a/src/cwb/cqp/ranges.c b/src/cwb/cqp/ranges.c index 01b314e..78f21d2 100644 --- a/src/cwb/cqp/ranges.c +++ b/src/cwb/cqp/ranges.c @@ -1106,7 +1106,7 @@ SortExternally(void) fclose(tmp); /* now, execute the external sort command on the temporary file */ - sprintf(sort_call, "%s %s %s | gawk '{print $1}'", ExternalSortCommand, (srt_ascending ? "" : "-r"), temporary_name); + snprintf(sort_call, CL_MAX_LINE_LENGTH, "%s %s %s | gawk '{print $1}'", ExternalSortCommand, (srt_ascending ? "" : "-r"), temporary_name); if (SORT_DEBUG) Rprintf("Running sort: \n\t%s\n", sort_call); diff --git a/src/cwb/cqp/tree.c b/src/cwb/cqp/tree.c index 5dae904..9d6c0bd 100644 --- a/src/cwb/cqp/tree.c +++ b/src/cwb/cqp/tree.c @@ -667,7 +667,7 @@ evaltree2searchstr(Evaltree etptr, int *length) right = evaltree2searchstr(etptr->node.right, &len_r); *length = len_l + len_r + 1; result = (char *)cl_malloc(*length); - sprintf(result, "%s %s", left, right); + snprintf(result, *length, "%s %s", left, right); cl_free(left); cl_free(right); break; @@ -680,7 +680,7 @@ evaltree2searchstr(Evaltree etptr, int *length) right = evaltree2searchstr(etptr->node.right, &len_r); *length = len_l + len_r + 7; result = (char *)cl_malloc(*length); - sprintf(result, "( %s | %s )", left, right); + snprintf(result, *length, "( %s | %s )", left, right); cl_free(left); cl_free(right); break; @@ -697,21 +697,21 @@ evaltree2searchstr(Evaltree etptr, int *length) if (min == 0 && max == repeat_inf) { *length = len_l + 5; result = (char *)cl_malloc(*length); - 
sprintf(result, "( %s )*", left); + snprintf(result, *length, "( %s )*", left); cl_free(left); } else if ((min == 1) && (max == repeat_inf)) { *length = len_l + 5; result = (char *)cl_malloc(*length); - sprintf(result, "( %s )+", left); + snprintf(result, *length, "( %s )+", left); cl_free(left); } else if ((min == 0) && (max == 1)) { *length = len_l + 4; result = (char *)cl_malloc(*length); - sprintf(result, "[ %s ]", left); + snprintf(result, *length, "[ %s ]", left); cl_free(left); } @@ -781,7 +781,7 @@ evaltree2searchstr(Evaltree etptr, int *length) /* etptr->node.type is something other than node */ char numstr[16]; assert(etptr->leaf.type == leaf); - sprintf(numstr, " \"%d\" ",etptr->leaf.patindex); + snprintf(numstr, 16, " \"%d\" ",etptr->leaf.patindex); result = cl_strdup(numstr); *length = strlen(result) + 1; } diff --git a/src/cwb/utils/cwb-align-encode.c b/src/cwb/utils/cwb-align-encode.c index 0e67ef8..b79d4f6 100644 --- a/src/cwb/utils/cwb-align-encode.c +++ b/src/cwb/utils/cwb-align-encode.c @@ -280,9 +280,9 @@ main(int argc, char *argv[]) } } else { - sprintf(alx_name, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name); + snprintf(alx_name, CL_MAX_LINE_LENGTH, "%s" SUBDIR_SEP_STRING "%s.alx", data_dir, attribute_name); if (compatibility) - sprintf(alg_name, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name); + snprintf(alg_name, CL_MAX_LINE_LENGTH, "%s" SUBDIR_SEP_STRING "%s.alg", data_dir, attribute_name); } /* now open output file(s) */ diff --git a/src/cwb/utils/cwb-align-show.c b/src/cwb/utils/cwb-align-show.c index ee6e42f..84b72f2 100644 --- a/src/cwb/utils/cwb-align-show.c +++ b/src/cwb/utils/cwb-align-show.c @@ -251,11 +251,11 @@ alignshow_print_next_region(FILE *f) /* print separator bar */ if (args == 6) - sprintf(line, "%s-alignment bead [%d, %d] x [%d, %d] (%d)", type, f1, l1, f2, l2, quality); + snprintf(line, CL_MAX_LINE_LENGTH, "%s-alignment bead [%d, %d] x [%d, %d] (%d)", type, f1, l1, f2, l2, quality); else if (args 
== 5) - sprintf(line, "%s-alignment bead [%d, %d] x [%d, %d] ", type, f1, l1, f2, l2); + snprintf(line, CL_MAX_LINE_LENGTH, "%s-alignment bead [%d, %d] x [%d, %d] ", type, f1, l1, f2, l2); else - sprintf(line, "alignment bead [%d, %d] x [%d, %d] ", f1, l1, f2, l2); + snprintf(line, CL_MAX_LINE_LENGTH, "alignment bead [%d, %d] x [%d, %d] ", f1, l1, f2, l2); n = (2 * COL_WIDTH + COL_SEP) - strlen(line); printf("%s", line); @@ -275,10 +275,10 @@ alignshow_print_next_region(FILE *f) n = strlen(word); } if ((w + n) > COL_WIDTH) break; /* column full */ - sprintf(col + w, "%s", word); w += n; + snprintf(col + w, MAX_COL_WIDTH + 1, "%s", word); w += n; i1++; /* next token */ if (w < COL_WIDTH) { - sprintf(col + w, " "); /* add token separator, if there's room */ + snprintf(col + w, MAX_COL_WIDTH + 1, " "); /* add token separator, if there's room */ w++; } } @@ -302,10 +302,10 @@ alignshow_print_next_region(FILE *f) } if ((w + n) > COL_WIDTH) break; /* column full */ - sprintf(col + w, "%s", word); w += n; + snprintf(col + w, MAX_COL_WIDTH + 1, "%s", word); w += n; i2++; /* next token */ if (w < COL_WIDTH) { - sprintf(col + w, " "); /* add token separator, if there's room */ + snprintf(col + w, MAX_COL_WIDTH + 1, " "); /* add token separator, if there's room */ w++; } } diff --git a/src/cwb/utils/cwb-compress-rdx.c b/src/cwb/utils/cwb-compress-rdx.c index c3e0927..80a4337 100644 --- a/src/cwb/utils/cwb-compress-rdx.c +++ b/src/cwb/utils/cwb-compress-rdx.c @@ -228,8 +228,8 @@ compress_reversed_index(Attribute *attr, char *output_fn, char *corpus_id, int d } if (output_fn) { - sprintf(data_fname, "%s.crc", output_fn); - sprintf(index_fname, "%s.crx", output_fn); + snprintf(data_fname, CL_MAX_FILENAME_LENGTH, "%s.crc", output_fn); + snprintf(index_fname, CL_MAX_FILENAME_LENGTH, "%s.crx", output_fn); } else { s = component_full_name(attr, CompCompRF, NULL); @@ -358,8 +358,8 @@ decompress_check_reversed_index(Attribute *attr, char *output_fn, char *corpus_i } if (output_fn) { - 
sprintf(data_fname, "%s.crc", output_fn); - sprintf(index_fname, "%s.crx", output_fn); + snprintf(data_fname, CL_MAX_FILENAME_LENGTH, "%s.crc", output_fn); + snprintf(index_fname, CL_MAX_FILENAME_LENGTH, "%s.crx", output_fn); } else { s = component_full_name(attr, CompCompRF, NULL); diff --git a/src/cwb/utils/cwb-decode.c b/src/cwb/utils/cwb-decode.c index 461f30d..add1111 100644 --- a/src/cwb/utils/cwb-decode.c +++ b/src/cwb/utils/cwb-decode.c @@ -241,23 +241,23 @@ decode_string_escape(const char *s) if (mode == XMLMode || xml_compatible) { for (i = 0; s[i]; i++) { if (s[i] == '"') { - sprintf(coded_s+t, "&quot;"); + snprintf(coded_s+t, CL_MAX_LINE_LENGTH, "&quot;"); t += strlen(coded_s+t); } else if (s[i] == '\'') { - sprintf(coded_s+t, "&apos;"); + snprintf(coded_s+t, CL_MAX_LINE_LENGTH, "&apos;"); t += strlen(coded_s+t); } else if (s[i] == '<') { - sprintf(coded_s+t, "&lt;"); + snprintf(coded_s+t, CL_MAX_LINE_LENGTH, "&lt;"); t += strlen(coded_s+t); } else if (s[i] == '>') { - sprintf(coded_s+t, "&gt;"); + snprintf(coded_s+t, CL_MAX_LINE_LENGTH, "&gt;"); t += strlen(coded_s+t); } else if (s[i] == '&') { - sprintf(coded_s+t, "&amp;"); + snprintf(coded_s+t, CL_MAX_LINE_LENGTH, "&amp;"); t += strlen(coded_s+t); } else if ((s[i] > 0) && (s[i] < 32)) { @@ -482,7 +482,7 @@ decode_add_attribute(const char *name, int type, const char *display_name, const return 0; } if (recursion > 0) - sprintf(temp, "%s%d", name, recursion); + snprintf(temp, CL_MAX_LINE_LENGTH, "%s%d", name, recursion); else cl_strcpy(temp, name); handle = get_attribute_handle(temp, type); /* aborts with error message if not found */ @@ -507,9 +507,9 @@ decode_add_attribute(const char *name, int type, const char *display_name, const av = cl_strdup(avspec); for (item = strtok(av, "+"); item; item = strtok(NULL, "+")) { if (recursion > 0) - sprintf(temp, "%s_%s%d", name, item, recursion); + snprintf(temp, CL_MAX_LINE_LENGTH, "%s_%s%d", name, item, recursion); else - sprintf(temp, "%s_%s", name, item); + snprintf(temp, CL_MAX_LINE_LENGTH, 
"%s_%s", name, item); av_handle = get_attribute_handle(temp, ATT_STRUC); if (!cl_struc_values(av_handle)) { fprintf(stderr, "Error: S-attribute %s selected by -S %s+%s has no annotated values (aborted).\n", diff --git a/src/cwb/utils/cwb-encode.c b/src/cwb/utils/cwb-encode.c index 01b76ed..43145cd 100644 --- a/src/cwb/utils/cwb-encode.c +++ b/src/cwb/utils/cwb-encode.c @@ -360,7 +360,7 @@ encode_scan_directory(char *dir) || (len_name >= 9 && (0 == strcasecmp(name + len_name - 8, DEFAULT_INFILE_EXTENSION ".bz2"))) ) { char *full_name = (char *) cl_malloc(len_dir + len_name + 2); - sprintf(full_name, "%s%c%s", dir, SUBDIR_SEPARATOR, name); + snprintf(full_name, len_dir + len_name + 2, "%s%c%s", dir, SUBDIR_SEPARATOR, name); if (stat(full_name, &statbuf) != 0) { perror("Can't stat file:"); encode_error("Failed to access input file %s -- aborted.\n", full_name); @@ -575,20 +575,20 @@ s_att_declare(char *name, char *directory, int store_values, int null_attribute) /* open data files for this s-attribute (children will be added later) */ /* create .rng component */ - sprintf(buf, PATH_STRUC_RNG, directory, sbuilder->name); + snprintf(buf, CL_MAX_LINE_LENGTH, PATH_STRUC_RNG, directory, sbuilder->name); if ((sbuilder->rng_fh = fopen(buf, "wb")) == NULL) { perror(buf); encode_error("Can't write .rng file for s-attribute <%s>.", name); } if (sbuilder->store_values) { /* create .avx and .avs components and initialise lexicon hash */ - sprintf(buf, PATH_STRUC_AVS, sbuilder->dir, sbuilder->name); + snprintf(buf, CL_MAX_LINE_LENGTH, PATH_STRUC_AVS, sbuilder->dir, sbuilder->name); if ((sbuilder->avs_fh = fopen(buf, "wb")) == NULL) { perror(buf); encode_error("Can't write .avs file for s-attribute <%s>.", name); } - sprintf(buf, PATH_STRUC_AVX, sbuilder->dir, sbuilder->name); + snprintf(buf, CL_MAX_LINE_LENGTH, PATH_STRUC_AVX, sbuilder->dir, sbuilder->name); if ((sbuilder->avx_fh = fopen(buf, "wb")) == NULL) { perror(buf); encode_error("Can't write .avx file for s-attribute 
<%s>.", name); @@ -613,7 +613,7 @@ s_att_declare(char *name, char *directory, int store_values, int null_attribute) sbuilder->recursion_children[0] = sbuilder; /* zeroeth recursion level is stored in the att. itself */ for (i = 1; i <= sbuilder->max_recursion; i++) { /* recursion children have 'flat' structure, because recursion is handled explicitly */ - sprintf(buf, "%s%d%s", sbuilder->name, i, is_feature_set ? "/" : ""); + snprintf(buf, CL_MAX_LINE_LENGTH, "%s%d%s", sbuilder->name, i, is_feature_set ? "/" : ""); sbuilder->recursion_children[i] = s_att_declare(buf, sbuilder->dir, sbuilder->store_values, /*null*/ 0); sbuilder->recursion_children[i]->automatic = 1; /* mark as automatically handled attribute */ } @@ -638,9 +638,9 @@ s_att_declare(char *name, char *directory, int store_values, int null_attribute) *p = '\0'; /* ea now points to NUL-terminated "" */ if (sbuilder->max_recursion >= 0) - sprintf(buf, "%s_%s:%d", sbuilder->name, ea, sbuilder->max_recursion); + snprintf(buf, CL_MAX_LINE_LENGTH, "%s_%s:%d", sbuilder->name, ea, sbuilder->max_recursion); else - sprintf(buf, "%s_%s", sbuilder->name, ea); + snprintf(buf, CL_MAX_LINE_LENGTH, "%s_%s", sbuilder->name, ea); /* potential feature set marker (/) is passed on to the respective child attribute and handled there */ if (ea[strlen(ea)-1] == '/') @@ -1102,9 +1102,9 @@ p_att_declare(char *name, char *directory, int nr_buckets) /* We now create paths for each of the three files that this encoder generates. * The paths aren't stored in the p_attr - only the file handles from opening them. 
*/ - sprintf(corname, PATH_POS_CORPUS, directory, p_encoder[p_encoder_ix].name); - sprintf(lexname, PATH_POS_LEX, directory, p_encoder[p_encoder_ix].name); - sprintf(idxname, PATH_POS_LEXIDX, directory, p_encoder[p_encoder_ix].name); + snprintf(corname, CL_MAX_LINE_LENGTH, PATH_POS_CORPUS, directory, p_encoder[p_encoder_ix].name); + snprintf(lexname, CL_MAX_LINE_LENGTH, PATH_POS_LEX, directory, p_encoder[p_encoder_ix].name); + snprintf(idxname, CL_MAX_LINE_LENGTH, PATH_POS_LEXIDX, directory, p_encoder[p_encoder_ix].name); /* Note: corpus_fh is a binary file, lex_fh is kinda mixed(*), and lexidx_fh is a binary file. * @@ -1400,7 +1400,7 @@ encode_generate_registry_file(char *registry_file) cl_id_toupper(corpus_name); info_file = (char *)cl_malloc(strlen(directory) + 1 + strlen(CWB_INFOFILE_DEFAULT_NAME) + 4); /* extra bytes as safety margin */ - sprintf(info_file, "%s%c%s", directory, SUBDIR_SEPARATOR, CWB_INFOFILE_DEFAULT_NAME); + snprintf(info_file, strlen(directory) + 1 + strlen(CWB_INFOFILE_DEFAULT_NAME) + 4, "%s%c%s", directory, SUBDIR_SEPARATOR, CWB_INFOFILE_DEFAULT_NAME); /* write header part for registry file */ fprintf(registry_fh, "##\n## registry entry for corpus %s\n##\n\n", corpus_name); diff --git a/src/cwb/utils/cwb-huffcode.c b/src/cwb/utils/cwb-huffcode.c index 4ac1bfb..dd93d17 100644 --- a/src/cwb/utils/cwb-huffcode.c +++ b/src/cwb/utils/cwb-huffcode.c @@ -622,9 +622,9 @@ compute_code_lengths(Attribute *attr, HCD *hc, char *fname) if (fname) { path = fname; - sprintf(hcd_path, "%s.hcd", path); - sprintf(huf_path, "%s.huf", path); - sprintf(sync_path, "%s.huf.syn", path); + snprintf(hcd_path, CL_MAX_LINE_LENGTH, "%s.hcd", path); + snprintf(huf_path, CL_MAX_LINE_LENGTH, "%s.huf", path); + snprintf(sync_path, CL_MAX_LINE_LENGTH, "%s.huf.syn", path); } else { path = component_full_name(attr, CompHuffSeq, NULL); @@ -749,9 +749,9 @@ decode_check_huff(Attribute *attr, char *corpus_id, char *fname) Rprintf("VALIDATING %s.%s\n", corpus_id, attr->any.name); 
if (fname) { - sprintf(hcd_path, "%s.hcd", fname); - sprintf(huf_path, "%s.huf", fname); - sprintf(sync_path, "%s.huf.syn", fname); + snprintf(hcd_path, CL_MAX_LINE_LENGTH, "%s.hcd", fname); + snprintf(huf_path, CL_MAX_LINE_LENGTH, "%s.huf", fname); + snprintf(sync_path, CL_MAX_LINE_LENGTH, "%s.huf.syn", fname); } else { char *path; diff --git a/src/cwb/utils/cwb-s-encode.c b/src/cwb/utils/cwb-s-encode.c index 2e3e45f..e668567 100644 --- a/src/cwb/utils/cwb-s-encode.c +++ b/src/cwb/utils/cwb-s-encode.c @@ -510,20 +510,20 @@ sencode_open_files(void) { char buf[CL_MAX_LINE_LENGTH]; - sprintf(buf, RNG_RNG, new_satt.dir, new_satt.name); + snprintf(buf, CL_MAX_LINE_LENGTH, RNG_RNG, new_satt.dir, new_satt.name); if ((new_satt.fd = fopen(buf, "wb")) == NULL) { perror(buf); exit(1); } if (new_satt.store_values) { - sprintf(buf, RNG_AVS, new_satt.dir, new_satt.name); + snprintf(buf, CL_MAX_LINE_LENGTH, RNG_AVS, new_satt.dir, new_satt.name); if ((new_satt.avs = fopen(buf, "w")) == NULL) { perror(buf); exit(1); } - sprintf(buf, RNG_AVX, new_satt.dir, new_satt.name); + snprintf(buf, CL_MAX_LINE_LENGTH, RNG_AVX, new_satt.dir, new_satt.name); if ((new_satt.avx = fopen(buf, "wb")) == NULL) { perror(buf); exit(1); diff --git a/src/cwb/utils/cwb-scan-corpus.c b/src/cwb/utils/cwb-scan-corpus.c index c80c75d..30978a4 100644 --- a/src/cwb/utils/cwb-scan-corpus.c +++ b/src/cwb/utils/cwb-scan-corpus.c @@ -645,7 +645,7 @@ main (int argc, char *argv[]) * but still avoid unnecessary computation if current cpos is outside region */ char *within_key = cl_malloc(strlen(within_att) + 4); - sprintf(within_key, "?%s+0", within_att); + snprintf(within_key, strlen(within_att) + 4, "?%s+0", within_att); scancorpus_add_key(within_key); cl_free(within_key); } From 241d989edffc4f2ee36e7269effd125e615c1d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 29 Mar 2025 07:29:06 +0100 Subject: [PATCH 56/90] exploring Rf_error() as alternative to exit() --- src/cwb/cl/fileutils.c 
| 5 +++++ src/cwb/cl/globals.h | 4 ++++ src/cwb/cl/lex.creg.c | 6 +++++- src/cwb/cl/macros.c | 16 ++++++++++++++++ src/cwb/cl/makecomps.c | 5 +++++ src/cwb/config.mk | 1 + src/cwb/definitions.mk | 1 + 7 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/cwb/cl/fileutils.c b/src/cwb/cl/fileutils.c index 990e73c..eb1121e 100644 --- a/src/cwb/cl/fileutils.c +++ b/src/cwb/cl/fileutils.c @@ -430,7 +430,12 @@ int was_pipe; break; default: Rprintf("CL: internal error, managed I/O stream has invalid type = %d\n", stream->type); +#ifndef R_PACKAGE exit(1); +#else + /* we return -1 to indicate that closing the stream was not successful */ + return -1; +#endif } /* remove stream from list */ diff --git a/src/cwb/cl/globals.h b/src/cwb/cl/globals.h index 050c719..524c0d5 100644 --- a/src/cwb/cl/globals.h +++ b/src/cwb/cl/globals.h @@ -16,7 +16,11 @@ */ +#ifdef R_PACKAGE void Rprintf(const char *, ...); +void Rf_error(const char *, ...); +#endif + #ifndef _cl_globals_h_ #define _cl_globals_h_ diff --git a/src/cwb/cl/lex.creg.c b/src/cwb/cl/lex.creg.c index b3dc2b4..5c041a0 100644 --- a/src/cwb/cl/lex.creg.c +++ b/src/cwb/cl/lex.creg.c @@ -2088,8 +2088,12 @@ YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, yy_size_t _yybytes_len ) static void yynoreturn yy_fatal_error (const char* msg ) { - Rprintf("%s\n", msg ); +#ifndef R_PACKAGE + Rprintf("%s\n", msg ); exit( YY_EXIT_FAILURE ); +#else + Rf_error(msg); +#endif } /* Redefine yyless() so it works in section 3 code. */ diff --git a/src/cwb/cl/macros.c b/src/cwb/cl/macros.c index 99fcefb..503fe71 100644 --- a/src/cwb/cl/macros.c +++ b/src/cwb/cl/macros.c @@ -58,7 +58,11 @@ cl_malloc(size_t bytes) Rprintf("CL: Out of memory. (killed)\n"); Rprintf("CL: [cl_malloc(%ld)]\n", bytes); Rprintf("\n"); /* for CQP's child mode */ + #ifdef R_PACKAGE + return NULL; + #else exit(1); + #endif } return block; } @@ -80,7 +84,11 @@ cl_calloc(size_t nr_of_elements, size_t element_size) Rprintf("CL: Out of memory. 
(killed)\n"); Rprintf("CL: [cl_calloc(%ld*%ld bytes)]\n", nr_of_elements, element_size); Rprintf("\n"); /* for CQP's child mode */ + #ifdef R_PACKAGE + return NULL; + #else exit(1); + #endif } return block; } @@ -109,7 +117,11 @@ cl_realloc(void *block, size_t bytes) Rprintf("CL: Out of memory. (killed)\n"); Rprintf("CL: [cl_realloc(block at %p to %ld bytes)]\n", block, bytes); Rprintf("\n"); /* for CQP's child mode */ + #ifdef R_PACKAGE + return NULL; + #else exit(1); + #endif } } return new_block; @@ -131,7 +143,11 @@ cl_strdup(const char *string) Rprintf("CL: Out of memory. (killed)\n"); Rprintf("CL: [cl_strdup(addr=%p, len=%ld)]\n", string, strlen(string)); Rprintf("\n"); /* for CQP's child mode */ + #ifdef R_PACKAGE + return NULL; + #else exit(1); + #endif } return new_string; } diff --git a/src/cwb/cl/makecomps.c b/src/cwb/cl/makecomps.c index 23146b1..9e4d47e 100644 --- a/src/cwb/cl/makecomps.c +++ b/src/cwb/cl/makecomps.c @@ -278,7 +278,12 @@ creat_rev_corpus(Component *revcorp) /* open REVCORP data file for writing */ if ((revcorp_fd = fopen(revcorp->path, "wb")) == NULL) { perror(revcorp->path); +#ifndef R_PACKAGE exit(1); +#else + Rf_error("Could not create reverse component\n"); + return -1; +#endif } /* diff --git a/src/cwb/config.mk b/src/cwb/config.mk index f7f65fe..6174e41 100644 --- a/src/cwb/config.mk +++ b/src/cwb/config.mk @@ -20,6 +20,7 @@ # * Edit this file to configure CWB for your system * # ********************************************************** + # # PLATFORM-SPECIFIC CONFIGURATION (OS and CPU type) # diff --git a/src/cwb/definitions.mk b/src/cwb/definitions.mk index fef8b76..7235adf 100644 --- a/src/cwb/definitions.mk +++ b/src/cwb/definitions.mk @@ -202,6 +202,7 @@ endif # Set up compiler and linker flags # +CFLAGS += -DR_PACKAGE CFLAGS += $(DEBUG_FLAGS) $(SITE_CFLAGS) LDFLAGS += $(DEBUG_FLAGS) $(SITE_LDFLAGS) From c30fdbec8a179b353d3c536f13ad8c1cb9ac8e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 29 
Mar 2025 22:52:02 +0100 Subject: [PATCH 57/90] replaces exit() with Rf_error() in C CWB code #95 --- cran-comments.md | 4 +++- src/cwb/cl/lex.creg.c | 17 +++++++++----- src/cwb/cl/makecomps.c | 4 ++++ src/cwb/cqp/cqp.c | 24 ++++++++++++++++++++ src/cwb/cqp/lex.yy.c | 16 ++++++++++++-- src/cwb/cqp/macro.c | 4 ++++ src/cwb/cqp/options.c | 32 +++++++++++++++++++++++++++ src/cwb/cqp/parse_actions.c | 4 ++++ src/cwb/cqp/parser.tab.c | 4 ++++ src/cwb/cqp/parser.y | 4 ++++ src/cwb/cqp/ranges.c | 4 ++++ src/cwb/cqp/regex2dfa.c | 44 +++++++++++++++++++++++++++++++++++++ src/cwb/cqp/symtab.c | 8 +++++++ 13 files changed, 161 insertions(+), 8 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index e2249a0..cc7a93a 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,8 @@ ## General remarks -- Usage of `sprintf()` has been replaced by `snprintf()` +- Usage of `sprintf()` has been replaced by `snprintf()` +- Calls of `exit()` have been replaced by `Rf_error()` (using preprocessor +directives and a newly used macro R_PACKAGE) Previous aspects I repeat: diff --git a/src/cwb/cl/lex.creg.c b/src/cwb/cl/lex.creg.c index 5c041a0..102192a 100644 --- a/src/cwb/cl/lex.creg.c +++ b/src/cwb/cl/lex.creg.c @@ -603,7 +603,11 @@ extern char *yytext; static yy_state_type yy_get_previous_state ( void ); static yy_state_type yy_try_NUL_trans ( yy_state_type current_state ); static int yy_get_next_buffer ( void ); +#ifndef R_PACKAGE static void yynoreturn yy_fatal_error ( const char* msg ); +#else +static void yy_fatal_error ( const char* msg ); +#endif /* Done after the current pattern has been matched and before the * corresponding action - sets up yytext. 
@@ -2086,15 +2090,18 @@ YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, yy_size_t _yybytes_len ) #define YY_EXIT_FAILURE 2 #endif -static void yynoreturn yy_fatal_error (const char* msg ) -{ #ifndef R_PACKAGE +static void yynoreturn yy_fatal_error (const char* msg ) +{ Rprintf("%s\n", msg ); - exit( YY_EXIT_FAILURE ); -#else + exit( YY_EXIT_FAILURE ); +} +#else +static void yy_fatal_error (const char* msg ) +{ - Rf_error(msg); + Rf_error("%s", msg); /* msg is data, never the format string */ -#endif } +#endif /* Redefine yyless() so it works in section 3 code. */ diff --git a/src/cwb/cl/makecomps.c b/src/cwb/cl/makecomps.c index 9e4d47e..25738b6 100644 --- a/src/cwb/cl/makecomps.c +++ b/src/cwb/cl/makecomps.c @@ -192,7 +192,11 @@ creat_freqs(Component *freqs) if ((fd = fopen(corpus_fn, "rb")) == NULL) { Rprintf("CL makecomps:creat_freqs(): Couldn't open corpus %s\n", corpus_fn); perror(corpus_fn); +#ifndef R_PACKAGE exit(2); +#else + Rf_error("Abort\n"); +#endif } /* do the counts */ diff --git a/src/cwb/cqp/cqp.c b/src/cwb/cqp/cqp.c index 5e53c42..7225125 100644 --- a/src/cwb/cqp/cqp.c +++ b/src/cwb/cqp/cqp.c @@ -115,7 +115,11 @@ static void sigINT_signal_handler(int signum) { if (!signal_handler_is_installed) +#ifndef R_PACKAGE exit(cqp_error_status ? cqp_error_status : 1); /* make sure we abort if Ctrl-C is pressed a second time (even on platforms where signal handlers don't need to be reinstalled) */ +#else + Rf_error("** Aborting evaluation ..."); +#endif if (EvaluationIsRunning) { Rprintf("** Aborting evaluation ... (press Ctrl-C again to exit CQP)\n"); @@ -245,14 +249,22 @@ initialize_cqp(int argc, char **argv) if (NULL != (cqprc = fopen(init_file_fullname, "r"))) { reading_cqprc = 1; /* not good for very much, really */ if (!cqp_parse_file(cqprc, 1)) { +#ifndef R_PACKAGE Rprintf("Parse errors while reading %s, exiting.\n", init_file_fullname); exit(cqp_error_status ?
cqp_error_status : 1); +#else + Rf_error("Parse errors while reading %s, exiting.\n", init_file_fullname); +#endif } reading_cqprc = 0; /* no need to close the file - cqp_parse_file() does so once it's chewed it up */ } else if (cqp_init_file) { +#ifndef R_PACKAGE Rprintf("Can't read initialization file %s\n", init_file_fullname); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Can't read initialization file %s\n", init_file_fullname); +#endif } } } @@ -282,14 +294,22 @@ initialize_cqp(int argc, char **argv) if (NULL != (cqprc = fopen(init_file_fullname, "r"))) { reading_cqprc = 1; if (!cqp_parse_file(cqprc, 1)) { +#ifndef R_PACKAGE Rprintf("Parse errors while reading %s, exiting.\n", init_file_fullname); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Parse errors while reading %s, exiting.\n", init_file_fullname); +#endif } reading_cqprc = 0; } else if (macro_init_file) { +#ifndef R_PACKAGE Rprintf("Can't read macro initialization file %s\n", init_file_fullname); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Can't read macro initialization file %s\n", init_file_fullname); +#endif } } } /* end of (if we have a macro init file, OR we are in interactive mode & able to seek for ~/.cqpmacros ...) */ @@ -299,8 +319,12 @@ initialize_cqp(int argc, char **argv) /* load the default corpus. */ if ((default_corpus) && !set_current_corpus_name(default_corpus, 0)) { +#ifndef R_PACKAGE Rprintf("Can't set current corpus to default corpus %s, exiting.\n", default_corpus); exit(cqp_error_status ? 
cqp_error_status : 1); +#else + Rf_error("Can't set current corpus to default corpus %s, exiting.\n", default_corpus); +#endif } #ifndef __MINGW__ diff --git a/src/cwb/cqp/lex.yy.c b/src/cwb/cqp/lex.yy.c index 94843ac..7e8036b 100644 --- a/src/cwb/cqp/lex.yy.c +++ b/src/cwb/cqp/lex.yy.c @@ -344,7 +344,11 @@ extern char *yytext; static yy_state_type yy_get_previous_state ( void ); static yy_state_type yy_try_NUL_trans ( yy_state_type current_state ); static int yy_get_next_buffer ( void ); +#ifndef R_PACKAGE static void yynoreturn yy_fatal_error ( const char* msg ); +#else +static void yy_fatal_error ( const char* msg ); +#endif /* Done after the current pattern has been matched and before the * corresponding action - sets up yytext. @@ -2889,11 +2893,19 @@ YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, yy_size_t _yybytes_len ) #define YY_EXIT_FAILURE 2 #endif + +#ifndef R_PACKAGE static void yynoreturn yy_fatal_error (const char* msg ) { - Rprintf("%s\n", msg ); - exit( YY_EXIT_FAILURE ); + Rprintf("%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} +#else +static void yy_fatal_error (const char* msg ) +{ + Rf_error("%s\n", msg ); } +#endif /* Redefine yyless() so it works in section 3 code. */ diff --git a/src/cwb/cqp/macro.c b/src/cwb/cqp/macro.c index 8c0c379..83b4a2d 100644 --- a/src/cwb/cqp/macro.c +++ b/src/cwb/cqp/macro.c @@ -278,8 +278,12 @@ MacroHashDelete(MacroEntry macro) if (p->next == macro) break; if (!p) { /* this REALLY shouldn't happen */ +#ifndef R_PACKAGE cqpmessage(Error, "MacroHashDelete: MacroEntry not found in hash ???"); exit(cqp_error_status ? 
cqp_error_status : 1); +#else + Rf_error("MacroHashDelete: MacroEntry not found in hash ???"); +#endif } p->next = macro->next; /* cut macro from list */ } diff --git a/src/cwb/cqp/options.c b/src/cwb/cqp/options.c index 77674d9..d18f725 100644 --- a/src/cwb/cqp/options.c +++ b/src/cwb/cqp/options.c @@ -377,8 +377,12 @@ cqp_usage(void) Rprintf("Usage: %s [options]\n", progname); break; default: +#ifndef R_PACKAGE Rprintf("??? Unknown application ???\n"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("??? Unknown application ???\n"); +#endif } Rprintf("Options:\n"); Rprintf(" -h help\n"); @@ -426,7 +430,11 @@ cqp_usage(void) Rprintf(" [ ServerLog [on], ServerDebug, Snoop (log all network traffic) ]\n"); Rprintf(" [ ALL (activate all modes except ParseOnly) ]\n"); Rprintf("\n"); +#ifndef R_PACKAGE exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Aborting ...\n"); +#endif } /** @@ -779,8 +787,12 @@ execute_side_effects(int opt) break; default: +#ifndef R_PACKAGE Rprintf("Unknown side-effect #%d invoked by option %s.\n", cqpoptions[opt].side_effect, cqpoptions[opt].opt_name); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Unknown side-effect #%d invoked by option %s.\n", cqpoptions[opt].side_effect, cqpoptions[opt].opt_name); +#endif } } @@ -1070,8 +1082,12 @@ parse_options(int ac, char *av[]) case 'E': if (!(query_string = getenv(optarg))) { +#ifndef R_PACKAGE Rprintf("Environment variable %s has no value, exiting\n", optarg); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Environment variable %s has no value, exiting\n", optarg); +#endif } break; @@ -1127,8 +1143,12 @@ parse_options(int ac, char *av[]) cl_set_debug_level(activate_cl_debug); } else { +#ifndef R_PACKAGE Rprintf("Invalid debug mode: -d %s\nType '%s -h' for more information.\n", optarg, progname); exit(cqp_error_status ? 
cqp_error_status : 1); +#else + Rf_error("Invalid debug mode: -d %s\nType '%s -h' for more information.\n", optarg, progname); +#endif } } break; @@ -1138,8 +1158,12 @@ parse_options(int ac, char *av[]) break; case 'v': +#ifndef R_PACKAGE Rprintf("%s\n", licensee); exit(cqp_error_status); +#else + Rf_error("%s\n", licensee); +#endif case 's': auto_subquery = 1; @@ -1197,13 +1221,21 @@ parse_options(int ac, char *av[]) /* note that cl_open_stream() handles the case where the filename is "-" for stdin */ if (!(batchfh = cl_open_stream(optarg, CL_STREAM_READ, CL_STREAM_MAGIC))) { perror(optarg); +#ifndef R_PACKAGE exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Aborting ...\n"); +#endif } break; default: +#ifndef R_PACKAGE Rprintf("Invalid option. Type '%s -h' for more information.\n", progname); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Invalid option. Type '%s -h' for more information.\n", progname); +#endif break; } } diff --git a/src/cwb/cqp/parse_actions.c b/src/cwb/cqp/parse_actions.c index d046260..b4d0acf 100644 --- a/src/cwb/cqp/parse_actions.c +++ b/src/cwb/cqp/parse_actions.c @@ -346,8 +346,12 @@ ActivateCorpus(CorpusList *cl) cqpmessage(Message, "ActivateCorpus: %s", cl); if (inhibit_activation) { +#ifndef R_PACKAGE Rprintf("Activation prohibited\n"); exit(cqp_error_status ? cqp_error_status : 1); /* hard way! */ +#else + Rf_error("Activation prohibited\n"); +#endif } else { query_corpus = cl; diff --git a/src/cwb/cqp/parser.tab.c b/src/cwb/cqp/parser.tab.c index 606029e..be6a782 100644 --- a/src/cwb/cqp/parser.tab.c +++ b/src/cwb/cqp/parser.tab.c @@ -2345,9 +2345,13 @@ yyparse () if ((yyvsp[(2) - (2)].ival) == query_lock) query_lock = 0; else { +#ifndef R_PACKAGE Rprintf("ALERT! Query lock violation.\n"); Rprintf("\n"); /* so CQP.pm won't block -- should no longer be needed after switching to .EOL. mechanism */ exit(1); +#else + Rf_error("ALERT! 
Query lock violation.\n"); +#endif } ;} break; diff --git a/src/cwb/cqp/parser.y b/src/cwb/cqp/parser.y index 86e4b2e..40eadf2 100644 --- a/src/cwb/cqp/parser.y +++ b/src/cwb/cqp/parser.y @@ -438,9 +438,13 @@ command: { prepare_input(); } if ($2 == query_lock) query_lock = 0; else { +#ifndef R_PACKAGE Rprintf("ALERT! Query lock violation.\n"); Rprintf("\n"); /* so CQP.pm won't block -- should no longer be needed after switching to .EOL. mechanism */ exit(1); +#else + Rf_error("ALERT! Query lock violation.\n"); +#endif } } ';' diff --git a/src/cwb/cqp/ranges.c b/src/cwb/cqp/ranges.c index 78f21d2..9c9f9d5 100644 --- a/src/cwb/cqp/ranges.c +++ b/src/cwb/cqp/ranges.c @@ -275,8 +275,12 @@ calculate_ranges(CorpusList *cl, int cpos, Context spc, int *left, int *right) break; default: +#ifndef R_PACKAGE Rprintf("calculate_ranges: undefined space type %d detected\n", spc.space_type); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("calculate_ranges: undefined space type %d detected\n", spc.space_type); +#endif break; } return 1; diff --git a/src/cwb/cqp/regex2dfa.c b/src/cwb/cqp/regex2dfa.c index c736688..10c5ca7 100644 --- a/src/cwb/cqp/regex2dfa.c +++ b/src/cwb/cqp/regex2dfa.c @@ -318,8 +318,12 @@ REGEX2DFA_ERROR(char *Format, ...) va_end(AP); fputc('\n', stderr); if (++ERRORS == MAX_ERRORS) { +#ifndef R_PACKAGE Rprintf("regex2dfa: Reached the %d error limit.\n", MAX_ERRORS); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("regex2dfa: Reached the %d error limit.\n", MAX_ERRORS); +#endif } } @@ -363,8 +367,12 @@ LEX(void) if (isalpha(Ch) || Ch == '_' || Ch == '$') { for (LastW = ChP; isalnum(Ch) || Ch == '_' || Ch == '$'; ChP++) { if (ChP - ChArr == MAX_CHAR) { +#ifndef R_PACKAGE Rprintf("Out of character space.\n"); exit(cqp_error_status ? 
cqp_error_status : 1); +#else + Rf_error("Out of character space.\n"); +#endif } *ChP = Ch; Ch = GET(); @@ -372,8 +380,12 @@ LEX(void) if (Ch != EOF) UNGET(Ch); if (ChP - ChArr == MAX_CHAR) { +#ifndef R_PACKAGE Rprintf("Out of character space.\n"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Out of character space.\n"); +#endif } *ChP++ = '\0'; return IdenT; @@ -382,19 +394,31 @@ LEX(void) Ch = GET(); for (LastW = ChP; Ch != '"' && Ch != EOF; ChP++) { if (ChP - ChArr == MAX_CHAR) { +#ifndef R_PACKAGE Rprintf("Out of character space.\n"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Out of character space.\n"); +#endif } *ChP = Ch; Ch = GET(); } if (Ch == EOF) { +#ifndef R_PACKAGE Rprintf("Missing closing \".\n"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Missing closing \".\n"); +#endif } if (ChP - ChArr == MAX_CHAR) { +#ifndef R_PACKAGE Rprintf("Out of character space.\n"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Out of character space.\n"); +#endif } *ChP++ = '\0'; return IdenT; @@ -604,8 +628,12 @@ static void PUSH(StackTag Tag, int Q) { if (SP >= Stack + STACK_MAX) { +#ifndef R_PACKAGE REGEX2DFA_ERROR("Expression too complex ... aborting."); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Expression too complex ... aborting."); +#endif } SP->Tag = Tag; SP->Q = Q; @@ -683,8 +711,12 @@ Parse(void) /* finish off parsing by checking for anything left over indicating errors etc. */ switch (Action[TOP][L]) { case 'A': +#ifndef R_PACKAGE REGEX2DFA_ERROR("Extra ','"); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Extra ','"); +#endif case 'B': REGEX2DFA_ERROR("Unmatched )."); L = LEX(); @@ -710,11 +742,19 @@ Parse(void) L = LEX(); goto MakeOpt; case 'H': +#ifndef R_PACKAGE REGEX2DFA_ERROR("Left-hand side of '=' must be symbol."); exit(cqp_error_status ? 
cqp_error_status : 1); +#else + Rf_error("Left-hand side of '=' must be symbol."); +#endif case 'I': +#ifndef R_PACKAGE REGEX2DFA_ERROR("Missing evaluation."); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Missing evaluation.\n"); +#endif case '.': POP(); return RHS; @@ -1178,7 +1218,11 @@ regex2dfa(char *rxs, DFA *automaton) if (ERRORS > 0) Rprintf("%d error(s)\n", ERRORS); if (Q == -1) +#ifndef R_PACKAGE exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("Aborting ...\n"); +#endif FormState(Q); MergeStates(); diff --git a/src/cwb/cqp/symtab.c b/src/cwb/cqp/symtab.c index aded1dd..8876eef 100644 --- a/src/cwb/cqp/symtab.c +++ b/src/cwb/cqp/symtab.c @@ -254,8 +254,12 @@ dup_reftab(RefTab rt1, RefTab rt2) { assert(rt1 && rt2); if (rt1->size != rt2->size) { +#ifndef R_PACKAGE Rprintf("dup_reftab(): Tried to dup() RefTab (%d entries) to RefTab of different size (%d entries)\n", rt1->size, rt2->size); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("dup_reftab(): Tried to dup() RefTab (%d entries) to RefTab of different size (%d entries)\n", rt1->size, rt2->size); +#endif } memcpy(rt2->data, rt1->data, rt1->size * sizeof(int)); } @@ -276,8 +280,12 @@ set_reftab(RefTab rt, int index, int value) { if (rt) { if (index < 0 || index >= rt->size) { +#ifndef R_PACKAGE cqpmessage(Error, "RefTab index #%d not in range 0 .. %d", index, rt->size - 1); exit(cqp_error_status ? cqp_error_status : 1); +#else + Rf_error("RefTab index #%d not in range 0 .. 
%d", index, rt->size - 1); +#endif } else rt->data[index] = value; From 9e9303f1d9d8d4402b4189b7be30093bef79c2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 29 Mar 2025 23:12:44 +0100 Subject: [PATCH 58/90] further calls to exit() replaced #95 --- src/cwb/CQi/server.c | 35 +++++++++++++++++++++++++++++++++++ src/cwb/cl/makecomps.c | 12 ++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/cwb/CQi/server.c b/src/cwb/CQi/server.c index c6acb22..5813b38 100644 --- a/src/cwb/CQi/server.c +++ b/src/cwb/CQi/server.c @@ -98,8 +98,12 @@ char cqi_error_string[GENERAL_ERROR_SIZE] = "No error."; void cqi_send_error(char *function) { +#ifndef R_PACKAGE cqiserver_log(Error, "ERROR CQi data send failure in function\n\t%s() ", function); exit(1); +#else + Rf_error("ERROR CQi data send failure in function\n\t%s() ", function); +#endif } /** @@ -112,8 +116,12 @@ cqi_send_error(char *function) void cqi_recv_error(char *function) { +#ifndef R_PACKAGE cqiserver_log(Error, "ERROR CQi data recv failure in function\n\t%s() \n", function); exit(1); +#else + Rf_error("ERROR CQi data recv failure in function\n\t%s() \n", function); +#endif } /** @@ -127,8 +135,12 @@ cqi_recv_error(char *function) void cqi_internal_error(char *function, char *cause) { +#ifndef R_PACKAGE cqiserver_log(Error, "ERROR Internal error in function\n\t%s() \n\t''%s''", function, cause); exit(1); +#else + Rf_error("ERROR Internal error in function\n\t%s() \n\t''%s''", function, cause); +#endif } @@ -190,8 +202,12 @@ accept_connection(int port) #ifndef __MINGW__ if (SIG_ERR == signal(SIGCHLD, SIG_IGN)) { +#ifndef R_PACKAGE perror("ERROR Can't ignore SIGCHLD"); exit(1); +#else + Rf_error("ERROR Can't ignore SIGCHLD"); +#endif } #endif @@ -254,7 +270,11 @@ accept_connection(int port) if (pid != 0) { /* parent returns to caller */ close(sockfd); +#ifndef R_PACKAGE exit(cqiserver_log(Info, "[child is running in background now, parent server quits]") || cqp_error_status); +#else + 
Rf_error("[child is running in background now, parent server quits]"); +#endif } } #else @@ -273,7 +293,12 @@ accept_connection(int port) FD_SET(sockfd, &read_fd); if (0 >= select(sockfd+1, &read_fd, NULL, NULL, &tv) || !FD_ISSET(sockfd, &read_fd)) +#ifndef R_PACKAGE exit(cqiserver_log(Error, "Port #%d timed out in private server mode. Aborting.", port) || cqp_error_status); +#else + Rf_error("Port #%d timed out in private server mode. Aborting.", port); +#endif + } connfd = accept(sockfd, (struct sockaddr *)&client_addr, &sin_size); @@ -304,7 +329,11 @@ accept_connection(int port) if (private_server) { close(sockfd); +#ifndef R_PACKAGE exit(cqiserver_log(Info, "Accepting no more connections (private server).") || cqp_error_status); +#else + Rf_error("Accepting no more connections (private server)."); +#endif /* SIGCHLD should be reaped by calling process */ } #else @@ -329,10 +358,16 @@ accept_connection(int port) /* check if remote host is in validation list */ if (!check_host(client_addr.sin_addr)) { +#ifndef R_PACKAGE cqiserver_log(Info, "WARNING %s not in list, connection refused!\n", remote_address); cqiserver_log(Info, "Exit. (pid = %d)\n", (int)getpid()); close(connfd); exit(1); +#else + close(connfd); + Rprintf("WARNING %s not in list, connection refused!\n", remote_address); + Rf_error("Exit. (pid = %d)\n", (int)getpid()); +#endif } #ifndef __MINGW__ diff --git a/src/cwb/cl/makecomps.c b/src/cwb/cl/makecomps.c index 25738b6..22c8a73 100644 --- a/src/cwb/cl/makecomps.c +++ b/src/cwb/cl/makecomps.c @@ -345,8 +345,12 @@ creat_rev_corpus(Component *revcorp) for (id = primus + 1; id <= secundus; id++) { ptr += cl_id2freq(attr, id); if (ptr != ptab[id]) { +#ifndef R_PACKAGE Rprintf("CL makecomps: Pointer inconsistency for id=%d. Aborting.\n", id); exit(1); +#else + Rf_error("CL makecomps: Pointer inconsistency for id=%d. Aborting.\n", id); +#endif } } @@ -364,8 +368,12 @@ creat_rev_corpus(Component *revcorp) /* finally, check amount of data read/written vs. 
expected */ if ((ints_written != cpos) || (ints_written != datasize)) { +#ifndef R_PACKAGE Rprintf("CL makecomps: Data size inconsistency: expected=%d, read=%d, written=%d.\n", datasize, cpos, ints_written); exit(1); +#else + Rf_error("CL makecomps: Data size inconsistency: expected=%d, read=%d, written=%d.\n", datasize, cpos, ints_written); +#endif } /* free allocated memory */ @@ -433,9 +441,13 @@ creat_rev_corpus_idx(Component *revcidx) /* WE DO NOT CONVERT the table from host to network order while * writing it, since it's already been created in network order!!! */ if (write_file_from_blob(revcidx->path, &(revcidx->data), 0) == 0) { +#ifndef R_PACKAGE Rprintf("CL makecomps: Can't open %s for writing", revcidx->path); perror(revcidx->path); exit(2); +#else + Rf_error("CL makecomps: Can't open %s for writing", revcidx->path); +#endif } return 1; From e952a840e5551da643b9ec21b5cac0f52bda5376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 00:24:01 +0100 Subject: [PATCH 59/90] stderr and stdout removed from C code #95 --- src/cwb/cl/corpus.c | 7 +++++++ src/cwb/cl/fileutils.c | 8 ++++++++ src/cwb/cl/globals.h | 1 + src/cwb/cl/ui-helpers.c | 8 ++++++++ src/cwb/cqp/cqp.c | 3 ++- src/cwb/cqp/regex2dfa.c | 4 ++++ 6 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/cwb/cl/corpus.c b/src/cwb/cl/corpus.c index 4489343..8f62db3 100644 --- a/src/cwb/cl/corpus.c +++ b/src/cwb/cl/corpus.c @@ -393,10 +393,17 @@ cl_new_corpus(char *registry_dir, char *registry_name) loaded_corpora = corpus; /* check whether ID field corresponds to name of registry file */ if (corpus->id && (strcmp(corpus->id, canonical_name) != 0)) { +#ifndef R_PACKAGE fprintf( stderr, "CL warning: ID field '%s' does not match name of registry file %s/%s\n", corpus->id, real_registry_name, canonical_name); +#else + Rf_warning( + "CL warning: ID field '%s' does not match name of registry file %s/%s\n", + corpus->id, real_registry_name, canonical_name); +
+#endif } } else diff --git a/src/cwb/cl/fileutils.c b/src/cwb/cl/fileutils.c index eb1121e..67d19bb 100644 --- a/src/cwb/cl/fileutils.c +++ b/src/cwb/cl/fileutils.c @@ -281,7 +281,11 @@ cl_open_stream(const char *filename, int mode, int type) /* "-" = STDIN or STDOUT */ if (cl_str_is(filename, "-")) +#ifndef R_PACKAGE type = CL_STREAM_STDIO; +#else + Rf_warning("Reading/writing from stdout/stdin disabled in R context\n"); +#endif else { point = (char *) filename + strspn(filename, " \t"); @@ -355,7 +359,11 @@ cl_open_stream(const char *filename, int mode, int type) handle = popen(filename, mode_spec); break; case CL_STREAM_STDIO: +#ifndef R_PACKAGE handle = (mode == CL_STREAM_READ) ? stdin : stdout; +#else + Rf_error("CL: invalid I/O stream type in R context = %d\n", type); +#endif break; default: Rprintf("CL: invalid I/O stream type = %d\n", type); diff --git a/src/cwb/cl/globals.h b/src/cwb/cl/globals.h index 524c0d5..c37c8f0 100644 --- a/src/cwb/cl/globals.h +++ b/src/cwb/cl/globals.h @@ -19,6 +19,7 @@ #ifdef R_PACKAGE void Rprintf(const char *, ...); void Rf_error(const char *, ...); +void Rf_warning(const char *, ...); #endif #ifndef _cl_globals_h_ diff --git a/src/cwb/cl/ui-helpers.c b/src/cwb/cl/ui-helpers.c index 7ed3bec..b362688 100644 --- a/src/cwb/cl/ui-helpers.c +++ b/src/cwb/cl/ui-helpers.c @@ -77,7 +77,9 @@ progress_bar_clear_line(void) { if (!progress_bar_simple) { /* clear the contents of the bottom terminal line */ Rprintf(" \r"); +#ifndef R_PACKAGE fflush(stderr); +#endif } } @@ -106,13 +108,17 @@ progress_bar_message(int pass, int total, char *message) } if (progress_bar_simple) { Rprintf("-::-PROGRESS-::-\t%d\t%d\t%s\n", pass, total, message); +#ifndef R_PACKAGE fflush(stdout); +#endif } else { Rprintf("["); Rprintf("pass %d of %d: ", pass, total); Rprintf("%s] \r", message); +#ifndef R_PACKAGE fflush(stderr); +#endif } } @@ -246,5 +252,7 @@ ilist_end(void) else Rprintf("\n"); ilist_cursor = 0; +#ifndef R_PACKAGE fflush(stdout); +#endif } diff 
--git a/src/cwb/cqp/cqp.c b/src/cwb/cqp/cqp.c index 7225125..8d037aa 100644 --- a/src/cwb/cqp/cqp.c +++ b/src/cwb/cqp/cqp.c @@ -443,11 +443,12 @@ cqp_parse_file(FILE *src, int exit_on_parse_errors) } /* in child mode, flush output streams after every parse pass. */ +#ifndef R_PACKAGE if (child_process && !reading_cqprc) { fflush(stdout); fflush(stderr); } - +#endif } /* end of loop over yyparse() calls. "ok" is now set to what we want to return. */ diff --git a/src/cwb/cqp/regex2dfa.c b/src/cwb/cqp/regex2dfa.c index 10c5ca7..6a19323 100644 --- a/src/cwb/cqp/regex2dfa.c +++ b/src/cwb/cqp/regex2dfa.c @@ -316,7 +316,11 @@ REGEX2DFA_ERROR(char *Format, ...) va_start(AP, Format); Rprintf(Format, AP); va_end(AP); +#ifndef R_PACKAGE fputc('\n', stderr); +#else + Rprintf("\n"); +#endif if (++ERRORS == MAX_ERRORS) { #ifndef R_PACKAGE Rprintf("regex2dfa: Reached the %d error limit.\n", MAX_ERRORS); From c4bfec435903ba4fc4b177840e7699d149e357d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 00:58:02 +0100 Subject: [PATCH 60/90] remaining usage of stdout removed #95 --- src/cwb/cl/lex.creg.c | 2 ++ src/cwb/cqp/cqp.c | 2 ++ src/cwb/cqp/eval.c | 2 ++ src/cwb/cqp/lex.yy.c | 4 +++- src/cwb/cqp/output.c | 13 +++++++++++++ src/cwb/cqp/parser.tab.c | 2 ++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/cwb/cl/lex.creg.c b/src/cwb/cl/lex.creg.c index 102192a..f9f7b58 100644 --- a/src/cwb/cl/lex.creg.c +++ b/src/cwb/cl/lex.creg.c @@ -1059,8 +1059,10 @@ YY_DECL if ( ! yyin ) yyin = stdin; +#ifndef R_PACKAGE if ( ! yyout ) yyout = stdout; +#endif if ( ! 
YY_CURRENT_BUFFER ) { yyensure_buffer_stack (); diff --git a/src/cwb/cqp/cqp.c b/src/cwb/cqp/cqp.c index 8d037aa..cac889a 100644 --- a/src/cwb/cqp/cqp.c +++ b/src/cwb/cqp/cqp.c @@ -201,8 +201,10 @@ initialize_cqp(int argc, char **argv) /* let's always run stdout unbuffered */ /* if (batchmode || rangeoutput || insecure || !isatty(fileno(stdout))) */ +#ifndef R_PACKAGE if (setvbuf(stdout, NULL, _IONBF, 0) != 0) perror("unbuffer stdout"); +#endif yydebug = parser_debug; diff --git a/src/cwb/cqp/eval.c b/src/cwb/cqp/eval.c index 7cb541e..1c797e4 100644 --- a/src/cwb/cqp/eval.c +++ b/src/cwb/cqp/eval.c @@ -3616,7 +3616,9 @@ show_environment(int thisenv) } Rprintf("\n ================= END ENVIRONMENT #%d =============\n", thisenv); +#ifndef R_PACKAGE fflush(stdout); +#endif } } diff --git a/src/cwb/cqp/lex.yy.c b/src/cwb/cqp/lex.yy.c index 7e8036b..1b81027 100644 --- a/src/cwb/cqp/lex.yy.c +++ b/src/cwb/cqp/lex.yy.c @@ -1410,8 +1410,10 @@ YY_DECL if ( ! yyin ) yyin = stdin; +#ifndef R_PACKAGE if ( ! yyout ) - yyout = stdout; + yyout = stdout; +#endif if ( ! YY_CURRENT_BUFFER ) { yyensure_buffer_stack (); diff --git a/src/cwb/cqp/output.c b/src/cwb/cqp/output.c index 99e9219..a3821b0 100644 --- a/src/cwb/cqp/output.c +++ b/src/cwb/cqp/output.c @@ -241,6 +241,7 @@ open_rd_output_stream(struct Redir *rd, CorpusCharset charset) rd->stream = cl_open_stream(rd->name, mode, (insecure) ? CL_STREAM_MAGIC_NOPIPE : CL_STREAM_MAGIC); rd->is_paging = False; } +#ifndef R_PACKAGE else { if (pager && paging && isatty(fileno(stdout))) { if (insecure) @@ -269,6 +270,11 @@ open_rd_output_stream(struct Redir *rd, CorpusCharset charset) rd->is_paging = False; } } +#else + else { + Rf_error("Paging not allowed in the R context\n"); + } +#endif if (!rd->stream) { cqpmessage(Error, "Can't write to %s: %s", (rd->name) ? 
rd->name : "STDOUT", cl_error_string(cl_errno)); @@ -607,7 +613,14 @@ corpus_info(CorpusList *cl) if (cl->type == SYSTEM) { /* use pager, or simply print to stdout if it fails */ stream_ok = open_rd_output_stream(&to_less, ascii); +#ifndef R_PACKAGE outfh = stream_ok ? to_less.stream : stdout; +#else + if (!stream_ok) + Rf_error("pager not available, aborting\n"); + else + outfh = to_less.stream; +#endif /* print name for child mode (added v3.4.15) */ if (child_process) diff --git a/src/cwb/cqp/parser.tab.c b/src/cwb/cqp/parser.tab.c index be6a782..d512108 100644 --- a/src/cwb/cqp/parser.tab.c +++ b/src/cwb/cqp/parser.tab.c @@ -2450,7 +2450,9 @@ yyparse () { /* print special code ``-::-EOL-::-'' marking end-of-command in child mode */ Rprintf("-::-EOL-::-\n"); +#ifndef R_PACKAGE fflush(stdout); +#endif ;} break; From 1a31b3d59d10c812c25bd9b48005ce8c5230c92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 01:08:44 +0100 Subject: [PATCH 61/90] removed usage of stdout in libcwb.a #95 --- src/cwb/utils/cwb-encode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cwb/utils/cwb-encode.c b/src/cwb/utils/cwb-encode.c index 43145cd..79466d3 100644 --- a/src/cwb/utils/cwb-encode.c +++ b/src/cwb/utils/cwb-encode.c @@ -1508,7 +1508,9 @@ int cwb_encode_worker(cl_string_list input_files){ while ( encode_get_input_line(linebuf, MAX_INPUT_LINE_LENGTH) ) { if (verbose && (line % 15000 == 0)) { Rprintf("%" COMMA_SEP_THOUSANDS_CONVSPEC "9dk tokens processed\r", line >> 10); +#ifndef R_PACKAGE fflush(stdout); +#endif } input_line++; From 2428d96a7e6231e22245b0c060105f199d936261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 01:31:03 +0100 Subject: [PATCH 62/90] vprintf and putchar removed #95 --- src/cwb/CQi/log.c | 2 +- src/cwb/cqp/regex2dfa.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cwb/CQi/log.c b/src/cwb/CQi/log.c index 946bb5b..ebba0d0 100644 --- 
a/src/cwb/CQi/log.c +++ b/src/cwb/CQi/log.c @@ -68,7 +68,7 @@ cqiserver_log(MessageType msg_type, const char *msg, ...) if (server_log || Error == msg_type) { va_start(vector, msg); Rprintf("CQPserver: "); - vprintf(msg, vector); + Rprintf(msg, vector); Rprintf("\n"); va_end(vector); } diff --git a/src/cwb/cqp/regex2dfa.c b/src/cwb/cqp/regex2dfa.c index 6a19323..56ccfe1 100644 --- a/src/cwb/cqp/regex2dfa.c +++ b/src/cwb/cqp/regex2dfa.c @@ -1113,7 +1113,7 @@ WriteStates(void) Rprintf(" |"); Rprintf(" %s s%d", SP->ShList[Sh].LHS->Name, STab[C].Class); } - putchar('\n'); + Rprintf('\n'); } } @@ -1156,7 +1156,7 @@ show_complete_dfa(DFA dfa) if (dfa.Final[i]) Rprintf("(final)"); else - putchar('\t'); + Rprintf('\t'); for (j = 0; j < dfa.Max_Input; j++) { Rprintf("\t%d -> ", j); if (dfa.TransTable[i][j] == dfa.E_State) @@ -1164,7 +1164,7 @@ show_complete_dfa(DFA dfa) else Rprintf("s%d,",dfa.TransTable[i][j]); } - putchar('\n'); + Rprintf('\n'); } } From 4dd918d2ca32c1339d06ce3f43d1cec8fc6da027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 01:36:38 +0100 Subject: [PATCH 63/90] =?UTF-8?q?resolve=20=E2=80=98Rprintf=E2=80=99=20mak?= =?UTF-8?q?es=20pointer=20from=20integer=20without=20a=20cast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/cwb/cqp/regex2dfa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cqp/regex2dfa.c b/src/cwb/cqp/regex2dfa.c index 56ccfe1..612d1cc 100644 --- a/src/cwb/cqp/regex2dfa.c +++ b/src/cwb/cqp/regex2dfa.c @@ -1113,7 +1113,7 @@ WriteStates(void) Rprintf(" |"); Rprintf(" %s s%d", SP->ShList[Sh].LHS->Name, STab[C].Class); } - Rprintf('\n'); + Rprintf("\n"); } } @@ -1156,7 +1156,7 @@ show_complete_dfa(DFA dfa) if (dfa.Final[i]) Rprintf("(final)"); else - Rprintf('\t'); + Rprintf("\t"); for (j = 0; j < dfa.Max_Input; j++) { Rprintf("\t%d -> ", j); if (dfa.TransTable[i][j] == dfa.E_State) @@ -1164,7 +1164,7 @@ 
show_complete_dfa(DFA dfa) else Rprintf("s%d,",dfa.TransTable[i][j]); } - Rprintf('\n'); + Rprintf("\n"); } } From 6eb3a08d57def4e0c409898630b70c0128afa84d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 01:50:26 +0100 Subject: [PATCH 64/90] no Apple/clang compiler warnings #95 --- src/cwb/cqp/output.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/cwb/cqp/output.c b/src/cwb/cqp/output.c index a3821b0..2dd8ef9 100644 --- a/src/cwb/cqp/output.c +++ b/src/cwb/cqp/output.c @@ -595,11 +595,15 @@ cqpmessage(MessageType type, const char *format, ...) void corpus_info(CorpusList *cl) { +#ifndef R_PACKAGE FILE *fh; FILE *outfh; char buf[CL_MAX_LINE_LENGTH]; int i, ok, stream_ok; +#else + int stream_ok; +#endif struct Redir to_less = { NULL, NULL, NULL, 0 }; /* for paging (with open_stream()) */ @@ -618,8 +622,6 @@ corpus_info(CorpusList *cl) #else if (!stream_ok) Rf_error("pager not available, aborting\n"); - else - outfh = to_less.stream; #endif /* print name for child mode (added v3.4.15) */ @@ -645,6 +647,7 @@ corpus_info(CorpusList *cl) Rprintf("\n"); /* do we have further info in a .INFO file? 
*/ +#ifndef R_PACKAGE if ( !cl->corpus->info_file || @@ -668,7 +671,10 @@ corpus_info(CorpusList *cl) cl_close_stream(fh); } - +#else + Rprintf("Printing information from info file not available from R.\n"); +#endif + if (stream_ok) close_rd_output_stream(&to_less); /* close pipe to pager if we were using it */ From 71e13f2a4d3a95c21643be451cf41aec7f47f956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 03:00:52 +0200 Subject: [PATCH 65/90] towards release --- DESCRIPTION | 6 +++--- cran-comments.md | 9 +++++++-- man/RcppCWB-packge.Rd | 8 ++++++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fd6c8df..bf18628 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.5 -Date: 2024-09-23 +Version: 0.6.6 +Date: 2025-03-30 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], @@ -56,7 +56,7 @@ Biarch: true VignetteBuilder: knitr URL: https://github.com/PolMine/RcppCWB BugReports: https://github.com/PolMine/RcppCWB/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Roxygen: list(markdown = TRUE) Collate: 'RcppCWB_package.R' diff --git a/cran-comments.md b/cran-comments.md index cc7a93a..88681b4 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,8 +1,13 @@ ## General remarks -- Usage of `sprintf()` has been replaced by `snprintf()` +This maintanence release shall meet expanded checks of symbols in linked static +libraries: + +- Usage of `sprintf()` has been replaced by `snprintf()`; - Calls of `exit()` have been replaced by `Rf_error()` (using preprocessor -directives and a newly used macro R_PACKAGE) +directives and a newly used macro R_PACKAGE); +- Calls of `putchar()` and `vprintf()` have been replaced; +- `stderr` and `stdout` not used any more. 
Previous aspects I repeat: diff --git a/man/RcppCWB-packge.Rd b/man/RcppCWB-packge.Rd index 4d3de67..4732be4 100644 --- a/man/RcppCWB-packge.Rd +++ b/man/RcppCWB-packge.Rd @@ -139,6 +139,14 @@ Open Corpus Workbench (\url{https://cwb.sourceforge.io}) Witten, I.H.; Moffat, A.; Bell, T.C. (1999). Managing Gigabytes. Morgan Kaufmann Publishing, San Francisco, 2nd edition. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/PolMine/RcppCWB} + \item Report bugs at \url{https://github.com/PolMine/RcppCWB/issues} +} + } \author{ Andreas Blaette (andreas.blaette@uni-due.de) From 15ce02a1d6092abd34953c8714ac02d24d1e2e4d Mon Sep 17 00:00:00 2001 From: ablaette Date: Sun, 30 Mar 2025 03:03:43 +0200 Subject: [PATCH 66/90] Update R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 3c7f823..66ec835 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -36,7 +36,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: From fd1884c7899a94466d579aec28eeabde0439adc2 Mon Sep 17 00:00:00 2001 From: ablaette Date: Sun, 30 Mar 2025 03:05:48 +0200 Subject: [PATCH 67/90] Update R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 66ec835..35cf2ba 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -60,7 +60,7 @@ jobs: - name: Cache R packages if: runner.os != 'Windows' - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} From c58f032f776813ca588d4eadcda641bbddecd393 Mon Sep 
17 00:00:00 2001 From: ablaette Date: Sun, 30 Mar 2025 03:08:50 +0200 Subject: [PATCH 68/90] Update R-CMD-check.yaml --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 35cf2ba..67a138f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -99,7 +99,7 @@ jobs: - name: Upload Windows binary if: matrix.config.os == 'windows-latest' - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: RcppCWB-Windows-binary path: ./*.zip From 7a992285ddb150f6aa62eb924cdae31a2dec6d82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 03:12:14 +0200 Subject: [PATCH 69/90] pkgdown.yaml updated --- .github/workflows/pkgdown.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 3c908d3..61b30a6 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -12,7 +12,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v1 @@ -26,7 +26,7 @@ jobs: shell: Rscript {0} - name: Cache R packages - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} From 05f72c2ee4e5b5c22cfdc49b79bd00df55a93265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 09:47:22 +0200 Subject: [PATCH 70/90] changed links for pcre.org --- DESCRIPTION | 2 +- README.Rmd | 2 +- README.md | 4 ++-- cran-comments.md | 4 ++++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf18628..769b3ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,7 @@ Maintainer: Andreas Blaette 
Description: 'Rcpp' Bindings for the C code of the 'Corpus Workbench' ('CWB'), an indexing and query engine to efficiently analyze large corpora (). 'RcppCWB' is licensed under the GNU GPL-3, in line with the GPL-3 license of the 'CWB' (). - The 'CWB' relies on 'pcre2' (BSD license, see ) + The 'CWB' relies on 'pcre2' (BSD license, see ) and 'GLib' (LGPL license, see ). See the file LICENSE.note for further information. The package includes modified code of the 'rcqp' package (GPL-2, see ). The original work of the authors diff --git a/README.Rmd b/README.Rmd index a075800..2313fb0 100644 --- a/README.Rmd +++ b/README.Rmd @@ -59,7 +59,7 @@ devtools::install_github("PolMine/RcppCWB", ref = "dev") ## Installation on macOS -On macOS, the [pcre2](http://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. We recommend to use 'Homebrew' as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew Website](https://brew.sh). It may also be necessary to also install [Xcode](https://developer.apple.com/xcode/) and [XQuartz](https://www.xquartz.org). +On macOS, the [pcre2](https://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. We recommend to use 'Homebrew' as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew Website](https://brew.sh). It may also be necessary to also install [Xcode](https://developer.apple.com/xcode/) and [XQuartz](https://www.xquartz.org). The following commands then need to be executed from a terminal window. They will install the C libraries the CWB relies on: diff --git a/README.md b/README.md index 0530680..70ce6a0 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ devtools::install_github("PolMine/RcppCWB", ref = "dev") ## Installation on macOS -On macOS, the [pcre2](http://www.pcre.org/) and +On macOS, the [pcre2](https://www.pcre.org/) and [Glib](https://docs.gtk.org/glib) libraries need to be present. 
We recommend to use ‘Homebrew’ as a package manager for macOS. To install Homebrew, follow the instructions on the [Homebrew @@ -185,7 +185,7 @@ cqp_initialize(registry = registry) cqp_query(corpus = "REUTERS", query = '"crude" "oil"') ``` - ## + ## ``` r cpos <- cqp_dump_subcorpus(corpus = "REUTERS") diff --git a/cran-comments.md b/cran-comments.md index 88681b4..b7a7b21 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -9,6 +9,10 @@ directives and a newly used macro R_PACKAGE); - Calls of `putchar()` and `vprintf()` have been replaced; - `stderr` and `stdout` not used any more. +Further fixes: + +- The page for the BSD licence for https://www.pcre.org/licence.txt is not +available, in the DESCRIPTION file I now refer to https://github.com/PCRE2Project/pcre2/blob/master/LICENCE.md Previous aspects I repeat: From b5107393850ae9d63801ed666549edc903cf0bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sun, 30 Mar 2025 10:07:35 +0200 Subject: [PATCH 71/90] handling exit() on Windows #95 --- cran-comments.md | 2 +- src/cwb/cl/windows-mmap.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cran-comments.md b/cran-comments.md index b7a7b21..a594ccc 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -12,7 +12,7 @@ directives and a newly used macro R_PACKAGE); Further fixes: - The page for the BSD licence for https://www.pcre.org/licence.txt is not -available, in the DESCRIPTION file I now refer to https://github.com/PCRE2Project/pcre2/blob/master/LICENCE.md +available. 
In the DESCRIPTION file I now refer to https://github.com/PCRE2Project/pcre2/blob/master/LICENCE.md Previous aspects I repeat: diff --git a/src/cwb/cl/windows-mmap.c b/src/cwb/cl/windows-mmap.c index abaddc7..69683e1 100644 --- a/src/cwb/cl/windows-mmap.c +++ b/src/cwb/cl/windows-mmap.c @@ -35,7 +35,11 @@ #ifdef __MINGW__ +#ifdef R_PACKAGE void Rprintf(const char *, ...); +void Rf_error(const char *, ...); +#endif + #include "windows-mmap.h" /** @@ -61,16 +65,24 @@ mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset) if (!fstat(fd, &st)) len = (size_t) st.st_size; else { +#ifndef R_PACKAGE Rprintf("mmap: could not determine filesize"); exit(1); +#else + Rf_error("mmap: could not determine filesize"); +#endif } if ((length + offset) > len) length = len - offset; if (!(flags & MAP_PRIVATE)) { +#ifndef R_PACKAGE Rprintf("Invalid usage of mmap when built with USE_WIN32_MMAP"); exit(1); +#else + Rf_error("Invalid usage of mmap when built with USE_WIN32_MMAP"); +#endif } hmap = CreateFileMapping((HANDLE)_get_osfhandle(fd), 0, PAGE_WRITECOPY, 0, 0, 0); From 76ccd61cdb221534ddd319f5acac478bcd2668e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Thu, 10 Apr 2025 22:20:13 +0200 Subject: [PATCH 72/90] dissable assert in CWB static libs by setting macro NDEBUG #97 --- src/cwb/definitions.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cwb/definitions.mk b/src/cwb/definitions.mk index 7235adf..cd0ae03 100644 --- a/src/cwb/definitions.mk +++ b/src/cwb/definitions.mk @@ -202,7 +202,7 @@ endif # Set up compiler and linker flags # -CFLAGS += -DR_PACKAGE +CFLAGS += -DR_PACKAGE -DNDEBUG CFLAGS += $(DEBUG_FLAGS) $(SITE_CFLAGS) LDFLAGS += $(DEBUG_FLAGS) $(SITE_LDFLAGS) From 7adc5d2799d861f2d135effe8ec6aae7bb7352ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 11 Apr 2025 09:35:28 +0200 Subject: [PATCH 73/90] add macros to handle warnings resulting from setting macro NDEBUG #98 --- 
src/cwb/cl/makecomps.c | 9 ++++++++- src/cwb/utils/cwb-huffcode.c | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/cwb/cl/makecomps.c b/src/cwb/cl/makecomps.c index 22c8a73..c082b35 100644 --- a/src/cwb/cl/makecomps.c +++ b/src/cwb/cl/makecomps.c @@ -241,7 +241,10 @@ creat_freqs(Component *freqs) int creat_rev_corpus(Component *revcorp) { +/* freqs is only used in assert, causes unused variable warning when macro NDEBUG is set */ +#ifndef NDEBUG Component *freqs; +#endif int cpos = 0, f, id, ints_written, pass; int datasize; @@ -261,11 +264,15 @@ creat_rev_corpus(Component *revcorp) attr = revcorp->attribute; /* need the attribute handle to use CL functions */ /* get the frequency table to compute offsets and fill buffer */ +#ifndef NDEBUG freqs = ensure_component(attr, CompCorpusFreqs, 1); assert(freqs != NULL); assert(freqs->corpus == revcorp->corpus); /* gotta be kidding ... */ - +#else + ensure_component(attr, CompCorpusFreqs, 1); +#endif + lexsize = cl_max_id(attr); /* this is the number of lexicon entries for this attribute */ ptab = (int **) cl_malloc(sizeof(int *) * ((size_t) lexsize)); /* table of pointers into */ diff --git a/src/cwb/utils/cwb-huffcode.c b/src/cwb/utils/cwb-huffcode.c index dd93d17..a8c5904 100644 --- a/src/cwb/utils/cwb-huffcode.c +++ b/src/cwb/utils/cwb-huffcode.c @@ -610,15 +610,22 @@ compute_code_lengths(Attribute *attr, HCD *hc, char *fname) char huf_path[CL_MAX_LINE_LENGTH]; char sync_path[CL_MAX_LINE_LENGTH]; +/* corp is only used by assert, if macro NDEBUG is set, defining the variable causes compiler warning */ +#ifndef NDEBUG Component *corp; +#endif BFile bfd; FILE *sync; int cl, code, pos; +#ifndef NDEBUG corp = ensure_component(attr, CompCorpus, 0); assert(corp); +#else + ensure_component(attr, CompCorpus, 0); +#endif if (fname) { path = fname; From 94b9d54ea39d045871f3b4a29e210568887e0eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 11 Apr 2025 11:11:11 +0200 
Subject: [PATCH 74/90] ready for maintenance release v0.6.7 --- DESCRIPTION | 4 ++-- cran-comments.md | 24 +++++++++--------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 769b3ec..1d7839e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.6 -Date: 2025-03-30 +Version: 0.6.7 +Date: 2025-04-11 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/cran-comments.md b/cran-comments.md index a594ccc..95c90e3 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,18 +1,13 @@ ## General remarks -This maintanence release shall meet expanded checks of symbols in linked static -libraries: +This is a quick follow up to the previous release to meet expanded checks of +symbols in linked static libraries. A WARNING issued on macOS check machines +is addressed: -- Usage of `sprintf()` has been replaced by `snprintf()`; -- Calls of `exit()` have been replaced by `Rf_error()` (using preprocessor -directives and a newly used macro R_PACKAGE); -- Calls of `putchar()` and `vprintf()` have been replaced; -- `stderr` and `stdout` not used any more. - -Further fixes: - -- The page for the BSD licence for https://www.pcre.org/licence.txt is not -available. In the DESCRIPTION file I now refer to https://github.com/PCRE2Project/pcre2/blob/master/LICENCE.md +- (Non-)Usage of `assert()` is now controlled by explicitly setting the flag +NDEBUG when compiling static libraries; +- 'unused variable' warnings that are issued as a side effect are handled by +using macros. Previous aspects I repeat: @@ -28,10 +23,9 @@ change. 
## Test environments -* Docker image with Fedora 40, R-devel r87186 and GCC 14 +* Docker image with Fedora 42, R-devel r87186 and GCC 20 * CI checks with GitHub Actions (Windows/macOS/Ubuntu) -* R winbuilder (R 4.3.3, R 4.4.1, R-devel r87186 ucrt) -* local macOS, R 4.3.1 (arm64) +* local macOS, R 4.4.1 (arm64) ## R CMD check results From f8f9ea32cedfe48375c80f6b22ba500ea72bc14c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 16 Jul 2025 22:10:51 +0200 Subject: [PATCH 75/90] null destination pointer warning fixed #99 --- DESCRIPTION | 4 ++-- src/cwb/cqp/print-modes.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1d7839e..ee7d5e0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.7 -Date: 2025-04-11 +Version: 0.6.8 +Date: 2025-07-16 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/src/cwb/cqp/print-modes.c b/src/cwb/cqp/print-modes.c index 73002f4..1ca2559 100644 --- a/src/cwb/cqp/print-modes.c +++ b/src/cwb/cqp/print-modes.c @@ -91,7 +91,7 @@ ComputePrintStructures(CorpusList *cl) for ( ai = al ? al->list : NULL ; ai ; ai = ai->next ) { if (p != printStructure) *p++ = ' '; /* insert blank between attributes */ - snprintf(p, strlen(p), "%s", ai->attribute->any.name); + snprintf(p, printStructure + sizeof(printStructure) - p, "%s", ai->attribute->any.name); p += strlen(p); } From 23d4516995fbadb1dab7d21a6f28dbee3d2fe372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 16 Jul 2025 22:53:25 +0200 Subject: [PATCH 76/90] updated NEWS file --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 39101c8..87b4876 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# RcppCWB 0.6.8 + +* Fixes null destination pointer warning issued by gcc-ASAN #99. 
+ # RcppCWB 0.6.5 * Fixes a 'exceeds maximum object size'-compiler warning #93. From f2cb3af491c6e8e78c541d028d5a58b885cba27d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 16 Jul 2025 22:58:27 +0200 Subject: [PATCH 77/90] _PACKAGE satisfying roxygen #96 --- R/RcppCWB_package.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/RcppCWB_package.R b/R/RcppCWB_package.R index e37b544..f3ae631 100644 --- a/R/RcppCWB_package.R +++ b/R/RcppCWB_package.R @@ -135,5 +135,5 @@ #' paste(tokens, collapse = " ") #' } #' ) -NULL +"_PACKAGE" From 7f5d3c8e8aff2d064e0561b9aeb71a547520dd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Wed, 16 Jul 2025 23:04:30 +0200 Subject: [PATCH 78/90] cran-comments updated --- cran-comments.md | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index 95c90e3..3163554 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,13 +1,6 @@ ## General remarks -This is a quick follow up to the previous release to meet expanded checks of -symbols in linked static libraries. A WARNING issued on macOS check machines -is addressed: - -- (Non-)Usage of `assert()` is now controlled by explicitly setting the flag -NDEBUG when compiling static libraries; -- 'unused variable' warnings that are issued as a side effect are handled by -using macros. +This release fixes an error thrown by gcc-ASAN checks (null destination pointer warning) Previous aspects I repeat: @@ -23,19 +16,14 @@ change. ## Test environments -* Docker image with Fedora 42, R-devel r87186 and GCC 20 +* Docker image with Fedora 42 * CI checks with GitHub Actions (Windows/macOS/Ubuntu) * local macOS, R 4.4.1 (arm64) ## R CMD check results -Check status is OK on all test environments. 
A warning I have seen but that I cannot reproduce results from this website: - -https://txm.gitpages.huma-num.fr/textometrie/ (unable to get local issuer certificate) - -I do not see these on the R winbuilder for R release of R devel. My browsers do -not show a problem with these certificates either. +Check status is OK on all test environments. ## Downstream dependencies From d9ce863428b7cc68b250df08ef897a4ce9794a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 26 Sep 2025 23:47:24 +0200 Subject: [PATCH 79/90] rm stderr/stdout --- DESCRIPTION | 4 ++-- src/cwb/cl/registry.tab.c | 6 +++++- src/cwb/cqp/context_descriptor.c | 4 ++++ src/cwb/cqp/parser.tab.c | 8 ++++++-- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ee7d5e0..af395ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: RcppCWB Type: Package Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB') -Version: 0.6.8 -Date: 2025-07-16 +Version: 0.6.9 +Date: 2025-09-26 Author: Andreas Blaette [aut, cre], Bernard Desgraupes [aut], Sylvain Loiseau [aut], diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index a9beaaf..df15d3d 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -825,9 +825,11 @@ do { \ do { \ if (yydebug) \ { \ - YYFPRINTF ("%s ", Title); \ + YYFPRINTF ("%s ", Title); +#ifndef R_PACKAGE yy_symbol_print (stderr, \ Type, Value); \ +#endif; YYFPRINTF ("\n"); \ } \ } while (YYID (0)) @@ -944,9 +946,11 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); +#ifndef R_PACKAGE yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)]) ); +#endif; Rprintf("\n"); } } diff --git a/src/cwb/cqp/context_descriptor.c b/src/cwb/cqp/context_descriptor.c index a5b20b0..725c94b 100644 --- a/src/cwb/cqp/context_descriptor.c +++ b/src/cwb/cqp/context_descriptor.c @@ -352,7 +352,11 @@ 
print_context_descriptor(ContextDescriptor *cdp) if (cdp) { stream_ok = open_rd_output_stream(&dst, ascii); +#ifndef R_PACKAGE fh = (stream_ok) ? dst.stream : stdout; /* use pager, or simply print to stdout if it fails */ +#else + if (stream_ok) fh = dst.stream; else return; +#endif; if (pretty_print) { Rprintf("===Context Descriptor=======================================\n"); diff --git a/src/cwb/cqp/parser.tab.c b/src/cwb/cqp/parser.tab.c index d512108..b722806 100644 --- a/src/cwb/cqp/parser.tab.c +++ b/src/cwb/cqp/parser.tab.c @@ -1600,9 +1600,11 @@ do { \ do { \ if (yydebug) \ { \ - YYFPRINTF ("%s ", Title); \ + YYFPRINTF ("%s ", Title); \ +#ifndef R_PACKAGE yy_symbol_print (stderr, \ Type, Value); \ +#endif YYFPRINTF ("\n"); \ } \ } while (YYID (0)) @@ -1719,10 +1721,12 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); +#ifndef R_PACKAGE yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)]) ); - Rprintf("\n"); +#endif + Rprintf("\n"); } } From b7adddb216ef72a783b434ec770aa0c49c36e589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 26 Sep 2025 23:53:18 +0200 Subject: [PATCH 80/90] no line breaks --- src/cwb/cl/registry.tab.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index df15d3d..0070360 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -827,8 +827,7 @@ do { \ { \ YYFPRINTF ("%s ", Title); #ifndef R_PACKAGE - yy_symbol_print (stderr, \ - Type, Value); \ + yy_symbol_print (stderr, Type, Value); #endif; YYFPRINTF ("\n"); \ } \ @@ -947,9 +946,7 @@ yy_reduce_print (yyvsp, yyrule) { Rprintf(" $%d = ", yyi + 1); #ifndef R_PACKAGE - yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], - &(yyvsp[(yyi + 1) - (yynrhs)]) - ); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)])); #endif; Rprintf("\n"); } From 
38db2233133d43a909d87510f52ba1d9cb0924f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 26 Sep 2025 23:56:52 +0200 Subject: [PATCH 81/90] rm ; after #endif --- src/cwb/cl/registry.tab.c | 4 ++-- src/cwb/cqp/context_descriptor.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 0070360..3e34bb9 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -828,7 +828,7 @@ do { \ YYFPRINTF ("%s ", Title); #ifndef R_PACKAGE yy_symbol_print (stderr, Type, Value); -#endif; +#endif YYFPRINTF ("\n"); \ } \ } while (YYID (0)) @@ -947,7 +947,7 @@ yy_reduce_print (yyvsp, yyrule) Rprintf(" $%d = ", yyi + 1); #ifndef R_PACKAGE yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)])); -#endif; +#endif Rprintf("\n"); } } diff --git a/src/cwb/cqp/context_descriptor.c b/src/cwb/cqp/context_descriptor.c index 725c94b..dd8ca70 100644 --- a/src/cwb/cqp/context_descriptor.c +++ b/src/cwb/cqp/context_descriptor.c @@ -356,7 +356,7 @@ print_context_descriptor(ContextDescriptor *cdp) fh = (stream_ok) ? 
dst.stream : stdout; /* use pager, or simply print to stdout if it fails */ #else if (stream_ok) fh = dst.stream; else return; -#endif; +#endif if (pretty_print) { Rprintf("===Context Descriptor=======================================\n"); From 4b6d7acb180dca56a04840dd748c09a4df060659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Fri, 26 Sep 2025 23:59:15 +0200 Subject: [PATCH 82/90] rm ; after #endif --- src/cwb/cl/registry.tab.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 3e34bb9..36dfb5a 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -825,10 +825,12 @@ do { \ do { \ if (yydebug) \ { \ - YYFPRINTF ("%s ", Title); + YYFPRINTF ("%s ", Title); \ + #ifndef R_PACKAGE yy_symbol_print (stderr, Type, Value); #endif + YYFPRINTF ("\n"); \ } \ } while (YYID (0)) From c5a94dbacaa53683285bbc00fa9aac00cccdc45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:01:28 +0200 Subject: [PATCH 83/90] rm ; after #endif --- src/cwb/cl/registry.tab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 36dfb5a..673436f 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -827,9 +827,9 @@ do { \ { \ YYFPRINTF ("%s ", Title); \ -#ifndef R_PACKAGE +/* yy_symbol_print (stderr, Type, Value); -#endif +*/ YYFPRINTF ("\n"); \ } \ @@ -947,9 +947,9 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); -#ifndef R_PACKAGE +/* yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)])); -#endif +*/ Rprintf("\n"); } } From a0f01ed7d8c2e7e292755629a842bcf1938f01b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:03:47 +0200 Subject: [PATCH 84/90] rm whitespace --- src/cwb/cl/registry.tab.c | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 673436f..786471f 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -825,14 +825,14 @@ do { \ do { \ if (yydebug) \ { \ - YYFPRINTF ("%s ", Title); \ + YYFPRINTF ("%s ", Title); /* yy_symbol_print (stderr, Type, Value); */ - YYFPRINTF ("\n"); \ - } \ + YYFPRINTF ("\n"); + } } while (YYID (0)) From cd6f9b12d0f464112981e6a83185175081dac540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:06:07 +0200 Subject: [PATCH 85/90] debugging --- src/cwb/cl/registry.tab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 786471f..1554490 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -822,9 +822,9 @@ do { \ } while (YYID (0)) # define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ -do { \ - if (yydebug) \ - { \ +do { + if (yydebug) + { YYFPRINTF ("%s ", Title); /* From 3dcc09e267ff10281c33cd77ce67435fab615c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:08:54 +0200 Subject: [PATCH 86/90] debugging --- src/cwb/cl/registry.tab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 1554490..0026b67 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -822,16 +822,16 @@ do { \ } while (YYID (0)) # define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ -do { +do { if (yydebug) { - YYFPRINTF ("%s ", Title); + YYFPRINTF("%s ", Title); /* yy_symbol_print (stderr, Type, Value); */ - YYFPRINTF ("\n"); + YYFPRINTF("\n"); } } while (YYID (0)) From 9d09220406a43c6422acf5581bd82128d2fde229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:10:06 +0200 Subject: [PATCH 87/90] debugging --- src/cwb/cl/registry.tab.c | 5 ----- 1 
file changed, 5 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 0026b67..ea56868 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -826,11 +826,6 @@ do { if (yydebug) { YYFPRINTF("%s ", Title); - -/* - yy_symbol_print (stderr, Type, Value); -*/ - YYFPRINTF("\n"); } } while (YYID (0)) From 1f2c05ec7264567b28d28826b9fdd43bd84445d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:16:20 +0200 Subject: [PATCH 88/90] fresh copy of registry.tab.c from master --- src/cwb/cl/registry.tab.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index ea56868..95368b5 100644 --- a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -822,12 +822,14 @@ do { \ } while (YYID (0)) # define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ -do { - if (yydebug) - { - YYFPRINTF("%s ", Title); - YYFPRINTF("\n"); - } +do { \ + if (yydebug) \ + { \ + YYFPRINTF ("%s ", Title); \ + /* yy_symbol_print (stderr, \ + Type, Value); */ \ + YYFPRINTF ("\n"); \ + } \ } while (YYID (0)) @@ -942,9 +944,9 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); -/* - yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)])); -*/ + /* yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); */ Rprintf("\n"); } } From 5bbc70753911387f62c19e803a26d99181c42263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Sat, 27 Sep 2025 00:21:17 +0200 Subject: [PATCH 89/90] getting it done --- src/cwb/cl/registry.tab.c | 8 ++++---- src/cwb/cqp/context_descriptor.c | 4 ---- src/cwb/cqp/parser.tab.c | 16 ++++++---------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/cwb/cl/registry.tab.c b/src/cwb/cl/registry.tab.c index 95368b5..a9beaaf 100644 --- 
a/src/cwb/cl/registry.tab.c +++ b/src/cwb/cl/registry.tab.c @@ -826,8 +826,8 @@ do { \ if (yydebug) \ { \ YYFPRINTF ("%s ", Title); \ - /* yy_symbol_print (stderr, \ - Type, Value); */ \ + yy_symbol_print (stderr, \ + Type, Value); \ YYFPRINTF ("\n"); \ } \ } while (YYID (0)) @@ -944,9 +944,9 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); - /* yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)]) - ); */ + ); Rprintf("\n"); } } diff --git a/src/cwb/cqp/context_descriptor.c b/src/cwb/cqp/context_descriptor.c index dd8ca70..a5b20b0 100644 --- a/src/cwb/cqp/context_descriptor.c +++ b/src/cwb/cqp/context_descriptor.c @@ -352,11 +352,7 @@ print_context_descriptor(ContextDescriptor *cdp) if (cdp) { stream_ok = open_rd_output_stream(&dst, ascii); -#ifndef R_PACKAGE fh = (stream_ok) ? dst.stream : stdout; /* use pager, or simply print to stdout if it fails */ -#else - if (stream_ok) fh = dst.stream; else return; -#endif if (pretty_print) { Rprintf("===Context Descriptor=======================================\n"); diff --git a/src/cwb/cqp/parser.tab.c b/src/cwb/cqp/parser.tab.c index b722806..4b10e9c 100644 --- a/src/cwb/cqp/parser.tab.c +++ b/src/cwb/cqp/parser.tab.c @@ -1600,12 +1600,10 @@ do { \ do { \ if (yydebug) \ { \ - YYFPRINTF ("%s ", Title); \ -#ifndef R_PACKAGE - yy_symbol_print (stderr, \ + YYFPRINTF ("%s ", Title); \ + /* yy_symbol_print (stderr, \ Type, Value); \ -#endif - YYFPRINTF ("\n"); \ + YYFPRINTF ("\n"); */ \ } \ } while (YYID (0)) @@ -1721,12 +1719,10 @@ yy_reduce_print (yyvsp, yyrule) for (yyi = 0; yyi < yynrhs; yyi++) { Rprintf(" $%d = ", yyi + 1); -#ifndef R_PACKAGE - yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + /* yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], &(yyvsp[(yyi + 1) - (yynrhs)]) - ); -#endif - Rprintf("\n"); + ); */ + Rprintf("\n"); } } From 
d1b845c87237ff174495d39c75bff9c006d9214b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Bl=C3=A4tte?= Date: Mon, 29 Sep 2025 21:13:16 +0200 Subject: [PATCH 90/90] fix stderr/stdout issue in compiled code #100 --- NEWS.md | 4 ++++ cran-comments.md | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 87b4876..0356d4a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# RcppCWB 0.6.9 + +* Fixes stderr/stdout usage in C code. + # RcppCWB 0.6.8 * Fixes null destination pointer warning issued by gcc-ASAN #99. diff --git a/cran-comments.md b/cran-comments.md index 3163554..4ca28d0 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,7 @@ ## General remarks -This release fixes an error thrown by gcc-ASAN checks (null destination pointer warning) +This release fixes a warning thrown by Fedora/R-devel (clang): potential +stderr/stdout usage removed from C code. Previous aspects I repeat: @@ -18,7 +19,7 @@ change. * Docker image with Fedora 42 * CI checks with GitHub Actions (Windows/macOS/Ubuntu) -* local macOS, R 4.4.1 (arm64) +* local macOS, R 4.5.1 (arm64) ## R CMD check results