diff --git a/NEWS b/NEWS index 1d546c5..3916943 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,10 @@ conversion routines will output debugging information when strings are converted from R to Java. + o JRI: console callbacks (Read/WriteConsole) encode and decode + strings between native encoding and Java. (#24) + Previously, only UTF-8 native locales were supported. + 1.0-6 2021-12-10 o remove obsolete autoconf macros diff --git a/jri/src/Makefile.all b/jri/src/Makefile.all index 081a9b2..16b7851 100644 --- a/jri/src/Makefile.all +++ b/jri/src/Makefile.all @@ -15,7 +15,10 @@ JRI.jar: $(JRI_JSRC) $(JNIPREFIX)jri$(JNISO) org_rosuda_JRI_Rengine.h: org/rosuda/JRI/Rengine.class if [ -n "$(JAVAH)" ]; then $(JAVAH) -d . -classpath . org.rosuda.JRI.Rengine; fi -Rcallbacks.o: Rcallbacks.c Rcallbacks.h globals.h org_rosuda_JRI_Rengine.h +Rcallbacks.o: Rcallbacks.c Rcallbacks.h globals.h rjstring.h org_rosuda_JRI_Rengine.h + $(CC) -c -o $@ $< $(CFLAGS) $(CPICF) $(JAVAINC) $(RINC) $(JRI_CPPFLAGS) + +rjstring.o: rjstring.c rjstring.h $(CC) -c -o $@ $< $(CFLAGS) $(CPICF) $(JAVAINC) $(RINC) $(JRI_CPPFLAGS) Rinit.o: Rinit.c Rinit.h Rcallbacks.h @@ -33,7 +36,7 @@ Rengine.o: Rengine.c org_rosuda_JRI_Rengine.h globals.h Rcallbacks.h Rinit.h jri.o: jri.c $(CC) -c -o $@ jri.c $(CFLAGS) $(CPICF) $(JAVAINC) $(RINC) $(JRI_CPPFLAGS) -$(JNIPREFIX)jri$(JNISO): Rengine.o jri.o Rcallbacks.o Rinit.o globals.o rjava.o $(JRIDEPS) +$(JNIPREFIX)jri$(JNISO): Rengine.o jri.o Rcallbacks.o Rinit.o globals.o rjava.o rjstring.o $(JRIDEPS) $(CC) -o $@ $^ $(LDFLAGS) $(JNILD) $(RLD) $(JRI_LIBS) win32/libjvm.dll.a: diff --git a/jri/src/Rcallbacks.c b/jri/src/Rcallbacks.c index a4e0130..39f0e3c 100644 --- a/jri/src/Rcallbacks.c +++ b/jri/src/Rcallbacks.c @@ -5,6 +5,7 @@ #include "globals.h" #include "Rdecl.h" #include "Rcallbacks.h" +#include "rjstring.h" #include "org_rosuda_JRI_Rengine.h" #include @@ -64,12 +65,15 @@ JNIEnv *checkEnvironment() int Re_ReadConsole(RCCONST char *prompt, RCSIGN char *buf, int len, 
int addtohistory) { - jstring r,s; + jstring r, s; jmethodID mid; - JNIEnv *lenv=checkEnvironment(); - - if (!lenv || !engineObj) return -1; - + JNIEnv *lenv=checkEnvironment(); + const void *vmax = 0; + int ret = -1; + const char *c = 0; + + if (!lenv || !engineObj) return -1; + jri_checkExceptions(lenv, 1); mid=(*lenv)->GetMethodID(eenv, engineClass, "jriReadConsole", "(Ljava/lang/String;I)Ljava/lang/String;"); #ifdef JRI_DEBUG @@ -77,28 +81,46 @@ int Re_ReadConsole(RCCONST char *prompt, RCSIGN char *buf, int len, int addtohis #endif jri_checkExceptions(lenv, 0); if (!mid) return -1; - - s=(*lenv)->NewStringUTF(eenv, prompt); - r=(jstring) (*lenv)->CallObjectMethod(lenv, engineObj, mid, s, addtohistory); + vmax = vmaxget(); + s = rj_newNativeJavaString(lenv, prompt, -1); + vmaxset(vmax); + if (!s) return -1; + r = (jstring) (*lenv)->CallObjectMethod(lenv, engineObj, mid, s, addtohistory); jri_checkExceptions(lenv, 1); (*lenv)->DeleteLocalRef(lenv, s); jri_checkExceptions(lenv, 0); - if (r) { - const char *c=(*lenv)->GetStringUTFChars(lenv, r, 0); - if (!c) return -1; - { - int l=strlen(c); - strncpy((char*)buf, c, (l>len-1)?len-1:l); - buf[(l>len-1)?len-1:l]=0; + while (r) { + /* get string in Java UTF-8 */ + c = (*lenv)->GetStringUTFChars(lenv, r, 0); + if (!c) break; + vmax = vmaxget(); + + /* convert from Java UTF-8 to real UTF-8 in a CHARSXP */ + SEXP sRes = rj_mkCharUTF8_noerr(c); + if (!sRes) { + vmaxset(vmax); + break; + } + + /* UTF8 -> native */ + const char *rc = Rf_translateChar(sRes); + int l = strlen(rc); + strncpy((char*)buf, rc, (l > len - 1) ? len - 1 : l); + vmaxset(vmax); + + /* truncate if needed */ + buf[(l > len - 1) ? 
len - 1 : l] = 0; #ifdef JRI_DEBUG - printf("Re_ReadConsole succeeded: \"%s\"\n",buf); + printf("Re_ReadConsole succeeded: \"%s\"\n", buf); #endif - } - (*lenv)->ReleaseStringUTFChars(lenv, r, c); + ret = 1; + break; + } + if (r) { + if (c) (*lenv)->ReleaseStringUTFChars(lenv, r, c); (*lenv)->DeleteLocalRef(lenv, r); - return 1; - } - return -1; + } + return ret; } void Re_Busy(int which) @@ -118,20 +140,27 @@ void Re_Busy(int which) void Re_WriteConsoleEx(RCCONST char *buf, int len, int oType) { - JNIEnv *lenv=checkEnvironment(); - jri_checkExceptions(lenv, 1); - { - jstring s=(*lenv)->NewStringUTF(lenv, buf); - jmethodID mid=(*lenv)->GetMethodID(lenv, engineClass, "jriWriteConsole", "(Ljava/lang/String;I)V"); - jri_checkExceptions(lenv, 0); + JNIEnv *lenv = checkEnvironment(); + jri_checkExceptions(lenv, 1); + + const void *vmax = vmaxget(); + jstring s = rj_newNativeJavaString(lenv, buf, len); + vmaxset(vmax); + if (!s) { #ifdef JRI_DEBUG - printf("jriWriteConsole mid=%x\n", mid); + printf("jriWriteConsole rj_newNativeJavaString() FAILED!\n"); #endif - if (!mid) return; - (*lenv)->CallVoidMethod(lenv, engineObj, mid, s, oType); - jri_checkExceptions(lenv, 1); - (*lenv)->DeleteLocalRef(lenv, s); - } + return; + } + jmethodID mid = (*lenv)->GetMethodID(lenv, engineClass, "jriWriteConsole", "(Ljava/lang/String;I)V"); + jri_checkExceptions(lenv, 0); +#ifdef JRI_DEBUG + printf("jriWriteConsole mid=%x\n", mid); +#endif + if (!mid) return; + (*lenv)->CallVoidMethod(lenv, engineObj, mid, s, oType); + jri_checkExceptions(lenv, 1); + (*lenv)->DeleteLocalRef(lenv, s); } /* old-style WriteConsole (for old R versions only) */ diff --git a/jri/src/rjstring.c b/jri/src/rjstring.c new file mode 100644 index 0000000..6fce28b --- /dev/null +++ b/jri/src/rjstring.c @@ -0,0 +1,231 @@ +#include "rjstring.h" + +#include +#include +#include +#include + +#ifdef WIN32 +/* -- currently unused - was used to mimick reEnc() + extern unsigned int localeCP; + static char cpbuf[16]; */ +#endif 
+static jchar js_zero[2] = { 0, 0 }; +static jchar js_buf[128]; + +/* if len = -1 then c is assumed to be NUL terminated */ +int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error) { + void *ih; + const char *ce = (len < 0) ? strchr(c, 0) : (c + len); + if (ce == c) { + buf[0] = js_zero; + return 0; + } + size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c; + jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1); + char *dst = (char*) js; + int end_test = 1, is_le = (((char*)&end_test)[0] == 1) ? 1 : 0; + if (!ifrom) ifrom = ""; + +#ifdef DEBUG_ENCODING + fprintf(stderr, "rJava.rj_char_utf16_native:"); + { const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); } + fprintf(stderr, "\n"); +#endif + + ih = Riconv_open(is_le ? "UTF-16LE" : "UTF-16BE", ifrom); + if (ih == (void *)(-1)) { + if (can_error) + Rf_error("Unable to start conversion to UTF-16"); + return -1; + } + while (c < ce) { + size_t res = Riconv(ih, &c, &isize, &dst, &osize); + /* this should never happen since we allocated far more than needed */ + if (res == -1 && errno == E2BIG) { + if (can_error) + Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements."); + return -1; + } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */ + if (is_le) { + *(dst++) = '?'; + *(dst++) = 0; + } else { + *(dst++) = 0; + *(dst++) = '?'; + } + osize -= 2; + c++; + isize--; + } + } + Riconv_close(ih); +#ifdef DEBUG_ENCODING + { const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); } + fprintf(stderr, "\n"); +#endif + return dst - (char*) js; +} + +/* returns string from a CHARSXP making sure that the result is in UTF-16. + the buffer is owned by the function and may be static, so copy after use. + + Returns the length of the resulting string or -1 on error (if + can_error is 0). 
+ */ +static int rj_CHARSXP_utf16_(SEXP s, jchar **buf, int can_error) { + cetype_t ce_in = getCharCE(s); + const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0); + if (ce == c) { + buf[0] = js_zero; + return 0; + } + + switch (ce_in) { +#ifdef WIN32 + case CE_NATIVE: +/* reEnc uses this, but translateCharUtf8 uses "" so let's go with "" + sprintf(cpbuf, "CP%d", localeCP); + ifrom = cpbuf; +*/ + break; + case CE_LATIN1: ifrom = "CP1252"; break; +#else + case CE_NATIVE: break; /* is already "" */ + case CE_LATIN1: ifrom = "latin1"; break; +#endif + default: + ifrom = "UTF-8"; break; + } + + return rj_char_utf16(c, ce - c, buf, ifrom, can_error); +} + +int rj_rchar_utf16(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 1); } /* conversion failures raise an R error */ +int rj_rchar_utf16_noerr(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 0); } /* conversion failures return -1 */ + +/* FIXME: we should probably deprecate this as well and use UTF-16 instead. + The only reason not to is that we would have to fully implement + a full UTF-16 -> UTF-8 conversion including surrogate pairs ... */ + +/* Java returns *modified* UTF-8 which is incompatible with UTF-8, + so we have to detect the illegal surrogate pairs and convert them. The result is a CE_UTF8 CHARSXP; on malformed input an R error is raised if can_error is non-zero, otherwise 0 is returned */ +SEXP rj_mkCharUTF8_(const char *src, int can_error) { + const unsigned char *s = (const unsigned char*) src; + const unsigned char *c = (const unsigned char*) s; + /* check if the string contains any surrogate pairs, i.e. 
+ Unicode in the range 0xD800-0xDFFF + We want this to be fast since in 99.99% of cases it will + be false */ + while (*c) { + if (c[0] == 0xED && + (c[1] & 0xE0) == 0xA0) + break; + c++; + } + if (*c) { /* yes, we have to convert them */ + SEXP res; + const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */ + unsigned char *dst = 0, *d, sbuf[64]; + if (!e) /* should never occur */ + return mkChar(""); + /* we use static buffer for small strings and dynamic alloc for large */ + if (e - s >= sizeof(sbuf)) { + /* allocate temp buffer since our input is const */ + d = dst = (unsigned char *) malloc(e - s + 1); + if (!dst) { + if (can_error) + Rf_error("Cannot allocate memory for surrogate pair conversion"); + return 0; + } + } else + d = (unsigned char *)sbuf; + if (c - s > 0) { + memcpy(d, s, c - s); + d += c - s; + } + while (*c) { + unsigned int u1, u; + *(d++) = *(c++); + /* start of a sequence ? */ + if ((c[-1] & 0xC0) != 0xC0) + continue; + if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */ + if ((c[0] & 0xC0) != 0x80) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal 2-byte sequence in Java string"); + return 0; + } + *(d++) = *(c++); + continue; + } + if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */ + if (dst) free(dst); + if (can_error) + Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)"); + return 0; + } + if (((c[0] & 0xC0) != 0x80 || + (c[1] & 0xC0) != 0x80)) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal 3-byte sequence in Java string"); + return 0; + } + u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) | + ((((unsigned int)c[0]) & 0x3F) << 6) | + (((unsigned int)c[1]) & 0x3F); + if (u1 < 0xD800 || u1 > 0xDFFF) { /* outside the whole surrogate range (high and low) -> regular copy; the bound must be 0xDFFF so that a lone low surrogate reaches the check below */ + *(d++) = *(c++); + *(d++) = *(c++); + continue; + } + if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? 
*/ + if (dst) free(dst); + if (can_error) + Rf_error("illegal sequence in Java string: low surrogate pair without a high one"); + return 0; + } + c += 2; /* move to the low pair */ + if (c[0] != 0xED || + (c[1] & 0xF0) != 0xB0 || + (c[2] & 0xC0) != 0x80) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one"); + return 0; + } + /* the actually encoded unicode character */ + u = ((((unsigned int)c[1]) & 0x0F) << 6) | + (((unsigned int)c[2]) & 0x3F); + u |= (u1 & 0x03FF) << 10; + u += 0x10000; + c += 3; + /* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */ + d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0); + *(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80); + *(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80); + *(d++) = (unsigned char) ((u & 0x3F) | 0x80); + } + res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8); + if (dst) free(dst); + return res; + } + return mkCharLenCE(src, c - s, CE_UTF8); +} + +SEXP rj_mkCharUTF8(const char *src) { return rj_mkCharUTF8_(src, 0); } +SEXP rj_mkCharUTF8_noerr(const char *src) { return rj_mkCharUTF8_(src, 1); } + +jstring rj_newJavaString(JNIEnv *env, SEXP sChar) { + jchar *s; + int len = rj_rchar_utf16(sChar, &s); + return (*env)->NewString(env, s, (len + 1) >> 1); +} + +jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len) { + jchar *s; + int rlen = rj_char_utf16(str, len, &s, "", 0); + return (rlen < 0) ? 0 : (*env)->NewString(env, s, (rlen + 1) >> 1); +} diff --git a/jri/src/rjstring.h b/jri/src/rjstring.h new file mode 100644 index 0000000..6fd24a1 --- /dev/null +++ b/jri/src/rjstring.h @@ -0,0 +1,25 @@ +#ifndef RJ_STRING_H__ +#define RJ_STRING_H__ + +#include /* for jchar */ +#include /* for SEXP */ + +/* --- API --- */ + +/* Returns static content for short strings so don't re-use. 
+ For dynamic strings uses R_alloc */ +int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error); + +/* wrappers for above to use with CHARSXP to detect proper ifrom */ +int rj_rchar_utf16(SEXP s, jchar **buf); +int rj_rchar_utf16_noerr(SEXP s, jchar **buf); + +/* return jstring, but do NOT check exceptions */ +jstring rj_newJavaString(JNIEnv *env, SEXP sChar); +jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len); + +/* takes modified UTF-8 from Java, creates CHARSXP with valid UTF8 */ +SEXP rj_mkCharUTF8(const char *src); +SEXP rj_mkCharUTF8_noerr(const char *src); + +#endif diff --git a/src/Rglue.c b/src/Rglue.c index c2ce7f4..f009c00 100644 --- a/src/Rglue.c +++ b/src/Rglue.c @@ -6,6 +6,7 @@ #include #include #include +#include "rjstring.h" /* R 4.0.1 broke EXTPTR_PTR ABI so re-map it to safety at the small expense of speed */ @@ -156,9 +157,9 @@ SEXP j2SEXP(JNIEnv *env, jobject o, int releaseLocal) { } } -#if R_VERSION >= R_Version(2,7,0) /* returns string from a CHARSXP making sure that the result is in UTF-8 - NOTE: this should NOT be used to create Java strings as they require UTF-16 natively */ + NOTE: this should NOT be used to create Java strings as they require UTF-16 natively + For Java strings use rj_*_utf16 function from rjstring.h */ const char *rj_char_utf8(SEXP s) { #ifdef DEBUG_ENCODING fprintf(stderr, "rJava.rj_char_utf8, CE=%d: \"%s\"\n", (int)Rf_getCharCE(s), CHAR(s)); @@ -168,176 +169,10 @@ const char *rj_char_utf8(SEXP s) { return (Rf_getCharCE(s) == CE_UTF8) ? CHAR(s) : Rf_reEnc(CHAR(s), getCharCE(s), CE_UTF8, 0); /* subst. invalid chars: 1=hex, 2=., 3=?, other=skip */ } -#ifdef WIN32 -extern unsigned int localeCP; -static char cpbuf[16]; -#endif -static jchar js_zero[2] = { 0, 0 }; -static jchar js_buf[128]; -/* returns string from a CHARSXP making sure that the result is in UTF-16. 
- the buffer is owned by the function and may be static, so copy after use */ -int rj_char_utf16(SEXP s, jchar **buf) { - void *ih; - cetype_t ce_in = getCharCE(s); - const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0); - if (ce == c) { - buf[0] = js_zero; - return 0; - } - size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c; - jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1); - char *dst = (char*) js; - int end_test = 1; - -#ifdef DEBUG_ENCODING - fprintf(stderr, "rJava.rj_char_utf16, CE=%d:", (int)ce_in); - { const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); } - fprintf(stderr, "\n"); -#endif - - switch (ce_in) { -#ifdef WIN32 - case CE_NATIVE: -/* reEnc uses this, but translateCharUtf8 uses "" so let's go with "" - sprintf(cpbuf, "CP%d", localeCP); - ifrom = cpbuf; -*/ - break; - case CE_LATIN1: ifrom = "CP1252"; break; -#else - case CE_NATIVE: break; /* is already "" */ - case CE_LATIN1: ifrom = "latin1"; break; -#endif - default: - ifrom = "UTF-8"; break; - } - -#ifdef DEBUG_ENCODING - fprintf(stderr, " '%s' -> UTF-16: ", ifrom); -#endif - ih = Riconv_open(((char*)&end_test)[0] == 1 ? 
"UTF-16LE" : "UTF-16BE", ifrom); - if(ih == (void *)(-1)) - Rf_error("Unable to start conversion to UTF-16"); - while (c < ce) { - size_t res = Riconv(ih, &c, &isize, &dst, &osize); - /* this should never happen since we allocated far more than needed */ - if (res == -1 && errno == E2BIG) - Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements."); - else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */ - *(dst++) = '?'; - *(dst++) = 0; - osize -= 2; - c++; - isize--; - } - } - Riconv_close(ih); -#ifdef DEBUG_ENCODING - { const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); } - fprintf(stderr, "\n"); -#endif - return dst - (char*) js; -} - -/* Java returns *modified* UTF-8 which is incompatible with UTF-8, - so we have to detect the illegal surrgoate pairs and convert them */ -SEXP mkCharUTF8(const char *src) { - const unsigned char *s = (const unsigned char*) src; - const unsigned char *c = (const unsigned char*) s; - /* check if the string contains any surrogate pairs, i.e. 
- Unicode in the range 0xD800-0xDFFF - We want this to be fast since in 99.99% of cases it will - be false */ - while (*c) { - if (c[0] == 0xED && - (c[1] & 0xE0) == 0xA0) - break; - c++; - } - if (*c) { /* yes, we have to convert them */ - SEXP res; - const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */ - unsigned char *dst = 0, *d, sbuf[64]; - if (!e) /* should never occur */ - return mkChar(""); - /* we use static buffer for small strings and dynamic alloc for large */ - if (e - s >= sizeof(sbuf)) { - /* allocate temp buffer since our input is const */ - d = dst = (unsigned char *) malloc(e - s + 1); - if (!dst) - Rf_error("Cannot allocate memory for surrogate pair conversion"); - } else - d = (unsigned char *)sbuf; - if (c - s > 0) { - memcpy(d, s, c - s); - d += c - s; - } - while (*c) { - unsigned int u1, u; - *(d++) = *(c++); - /* start of a sequence ? */ - if ((c[-1] & 0xC0) != 0xC0) - continue; - if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */ - if ((c[0] & 0xC0) != 0x80) { - if (dst) free(dst); - Rf_error("illegal 2-byte sequence in Java string"); - } - *(d++) = *(c++); - continue; - } - if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */ - if (dst) free(dst); - Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)"); - } - if (((c[0] & 0xC0) != 0x80 || - (c[1] & 0xC0) != 0x80)) { - if (dst) free(dst); - Rf_error("illegal 3-byte sequence in Java string"); - } - u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) | - ((((unsigned int)c[0]) & 0x3F) << 6) | - (((unsigned int)c[1]) & 0x3F); - if (u1 < 0xD800 || u1 > 0xDBFF) { /* not a surrogate pair -> regular copy */ - *(d++) = *(c++); - *(d++) = *(c++); - continue; - } - if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? 
*/ - if (dst) free(dst); - Rf_error("illegal sequence in Java string: low surrogate pair without a high one"); - } - c += 2; /* move to the low pair */ - if (c[0] != 0xED || - (c[1] & 0xF0) != 0xB0 || - (c[2] & 0xC0) != 0x80) { - if (dst) free(dst); - Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one"); - } - /* the actually encoded unicode character */ - u = ((((unsigned int)c[1]) & 0x0F) << 6) | - (((unsigned int)c[2]) & 0x3F); - u |= (u1 & 0x03FF) << 10; - u += 0x10000; - c += 3; - /* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */ - d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0); - *(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80); - *(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80); - *(d++) = (unsigned char) ((u & 0x3F) | 0x80); - } - res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8); - if (dst) free(dst); - return res; - } - return mkCharLenCE(src, c - s, CE_UTF8); -} - -#endif static jstring newJavaString(JNIEnv *env, SEXP sChar) { jchar *s; - size_t len = rj_char_utf16(sChar, &s); + size_t len = rj_rchar_utf16(sChar, &s); return newString16(env, s, (len + 1) >> 1); } diff --git a/src/rJava.h b/src/rJava.h index 9e5bf78..b4c4c8f 100644 --- a/src/rJava.h +++ b/src/rJava.h @@ -97,14 +97,17 @@ void profReport(char *fmt, ...); #define END_RJAVA_CALL }; #endif -/* define mkCharUTF8 in a compatible fashion */ +/* define mkCharUTF8 in a compatible fashion + NOTE: those should NOT be used anymore since native + Java strings use UTF-16 so use only in cases where UTF8 is required */ #if R_VERSION < R_Version(2,7,0) #define mkCharUTF8(X) mkChar(X) #define CHAR_UTF8(X) CHAR(X) #else +#define mkCharUTF8(X) rj_mkCharUTF8(X) #define CHAR_UTF8(X) rj_char_utf8(X) -extern SEXP mkCharUTF8(const char *); -extern const char *rj_char_utf8(SEXP); +extern SEXP rj_mkCharUTF8(const char *); /* rjstring.c */ +extern const char *rj_char_utf8(SEXP); /* Rglue.c */ #endif /* 
signatures are stored in a local buffer if they fit. Only if they don't fit a heap buffer is allocated and used. */ diff --git a/src/rjstring.c b/src/rjstring.c new file mode 100644 index 0000000..6fce28b --- /dev/null +++ b/src/rjstring.c @@ -0,0 +1,231 @@ +#include "rjstring.h" + +#include +#include +#include +#include + +#ifdef WIN32 +/* -- currently unused - was used to mimick reEnc() + extern unsigned int localeCP; + static char cpbuf[16]; */ +#endif +static jchar js_zero[2] = { 0, 0 }; +static jchar js_buf[128]; + +/* if len = -1 then c is assumed to be NUL terminated */ +int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error) { + void *ih; + const char *ce = (len < 0) ? strchr(c, 0) : (c + len); + if (ce == c) { + buf[0] = js_zero; + return 0; + } + size_t osize = sizeof(jchar) * (ce - c + 1), isize = ce - c; + jchar *js = buf[0] = (osize < sizeof(js_buf)) ? js_buf : (jchar*) R_alloc(sizeof(jchar), ce - c + 1); + char *dst = (char*) js; + int end_test = 1, is_le = (((char*)&end_test)[0] == 1) ? 1 : 0; + if (!ifrom) ifrom = ""; + +#ifdef DEBUG_ENCODING + fprintf(stderr, "rJava.rj_char_utf16_native:"); + { const char *c0 = c; while (*c0) fprintf(stderr, " %02x", (int)((unsigned char)*(c0++))); } + fprintf(stderr, "\n"); +#endif + + ih = Riconv_open(is_le ? 
"UTF-16LE" : "UTF-16BE", ifrom); + if (ih == (void *)(-1)) { + if (can_error) + Rf_error("Unable to start conversion to UTF-16"); + return -1; + } + while (c < ce) { + size_t res = Riconv(ih, &c, &isize, &dst, &osize); + /* this should never happen since we allocated far more than needed */ + if (res == -1 && errno == E2BIG) { + if (can_error) + Rf_error("Conversion to UTF-16 failed due to unexpectedly large buffer requirements."); + return -1; + } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) { /* invalid char */ + if (is_le) { + *(dst++) = '?'; + *(dst++) = 0; + } else { + *(dst++) = 0; + *(dst++) = '?'; + } + osize -= 2; + c++; + isize--; + } + } + Riconv_close(ih); +#ifdef DEBUG_ENCODING + { const jchar *j = js; while (j < (const jchar*)dst) fprintf(stderr, " %04x", (unsigned int)*(j++)); } + fprintf(stderr, "\n"); +#endif + return dst - (char*) js; +} + +/* returns string from a CHARSXP making sure that the result is in UTF-16. + the buffer is owned by the function and may be static, so copy after use. + + Returns the length of the resulting string or -1 on error (if + can_error is 0). 
+ */ +static int rj_CHARSXP_utf16_(SEXP s, jchar **buf, int can_error) { + cetype_t ce_in = getCharCE(s); + const char *ifrom = "", *c = CHAR(s), *ce = strchr(c, 0); + if (ce == c) { + buf[0] = js_zero; + return 0; + } + + switch (ce_in) { +#ifdef WIN32 + case CE_NATIVE: +/* reEnc uses this, but translateCharUtf8 uses "" so let's go with "" + sprintf(cpbuf, "CP%d", localeCP); + ifrom = cpbuf; +*/ + break; + case CE_LATIN1: ifrom = "CP1252"; break; +#else + case CE_NATIVE: break; /* is already "" */ + case CE_LATIN1: ifrom = "latin1"; break; +#endif + default: + ifrom = "UTF-8"; break; + } + + return rj_char_utf16(c, ce - c, buf, ifrom, can_error); +} + +int rj_rchar_utf16(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 1); } /* conversion failures raise an R error */ +int rj_rchar_utf16_noerr(SEXP s, jchar **buf) { return rj_CHARSXP_utf16_(s, buf, 0); } /* conversion failures return -1 */ + +/* FIXME: we should probably deprecate this as well and use UTF-16 instead. + The only reason not to is that we would have to fully implement + a full UTF-16 -> UTF-8 conversion including surrogate pairs ... */ + +/* Java returns *modified* UTF-8 which is incompatible with UTF-8, + so we have to detect the illegal surrogate pairs and convert them. The result is a CE_UTF8 CHARSXP; on malformed input an R error is raised if can_error is non-zero, otherwise 0 is returned */ +SEXP rj_mkCharUTF8_(const char *src, int can_error) { + const unsigned char *s = (const unsigned char*) src; + const unsigned char *c = (const unsigned char*) s; + /* check if the string contains any surrogate pairs, i.e. 
+ Unicode in the range 0xD800-0xDFFF + We want this to be fast since in 99.99% of cases it will + be false */ + while (*c) { + if (c[0] == 0xED && + (c[1] & 0xE0) == 0xA0) + break; + c++; + } + if (*c) { /* yes, we have to convert them */ + SEXP res; + const unsigned char *e = (const unsigned char*) strchr((const char*)s, 0); /* find the end for size */ + unsigned char *dst = 0, *d, sbuf[64]; + if (!e) /* should never occur */ + return mkChar(""); + /* we use static buffer for small strings and dynamic alloc for large */ + if (e - s >= sizeof(sbuf)) { + /* allocate temp buffer since our input is const */ + d = dst = (unsigned char *) malloc(e - s + 1); + if (!dst) { + if (can_error) + Rf_error("Cannot allocate memory for surrogate pair conversion"); + return 0; + } + } else + d = (unsigned char *)sbuf; + if (c - s > 0) { + memcpy(d, s, c - s); + d += c - s; + } + while (*c) { + unsigned int u1, u; + *(d++) = *(c++); + /* start of a sequence ? */ + if ((c[-1] & 0xC0) != 0xC0) + continue; + if ((c[-1] & 0xE0) == 0xC0) { /* 2-byte, not a surrogate pair */ + if ((c[0] & 0xC0) != 0x80) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal 2-byte sequence in Java string"); + return 0; + } + *(d++) = *(c++); + continue; + } + if ((c[-1] & 0xF0) != 0xE0) { /* must be 3-byte */ + if (dst) free(dst); + if (can_error) + Rf_error("illegal multi-byte seqeunce in Java string (>3-byte)"); + return 0; + } + if (((c[0] & 0xC0) != 0x80 || + (c[1] & 0xC0) != 0x80)) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal 3-byte sequence in Java string"); + return 0; + } + u1 = ((((unsigned int)c[-1]) & 0x0F) << 12) | + ((((unsigned int)c[0]) & 0x3F) << 6) | + (((unsigned int)c[1]) & 0x3F); + if (u1 < 0xD800 || u1 > 0xDFFF) { /* outside the whole surrogate range (high and low) -> regular copy; the bound must be 0xDFFF so that a lone low surrogate reaches the check below */ + *(d++) = *(c++); + *(d++) = *(c++); + continue; + } + if (u1 >= 0xDC00 && u1 <= 0xDFFF) { /* low surrogate pair ? 
*/ + if (dst) free(dst); + if (can_error) + Rf_error("illegal sequence in Java string: low surrogate pair without a high one"); + return 0; + } + c += 2; /* move to the low pair */ + if (c[0] != 0xED || + (c[1] & 0xF0) != 0xB0 || + (c[2] & 0xC0) != 0x80) { + if (dst) free(dst); + if (can_error) + Rf_error("illegal sequence in Java string: high surrogate pair not followed by low one"); + return 0; + } + /* the actually encoded unicode character */ + u = ((((unsigned int)c[1]) & 0x0F) << 6) | + (((unsigned int)c[2]) & 0x3F); + u |= (u1 & 0x03FF) << 10; + u += 0x10000; + c += 3; + /* it must be <= 0x10FFFF by design (each surrogate has 10 bits) */ + d[-1] = (unsigned char) (((u >> 18) & 0x0F) | 0xF0); + *(d++) = (unsigned char) (((u >> 12) & 0x3F) | 0x80); + *(d++) = (unsigned char) (((u >> 6) & 0x3F) | 0x80); + *(d++) = (unsigned char) ((u & 0x3F) | 0x80); + } + res = mkCharLenCE((const char*) (dst ? dst : sbuf), dst ? (d - dst) : (d - sbuf), CE_UTF8); + if (dst) free(dst); + return res; + } + return mkCharLenCE(src, c - s, CE_UTF8); +} + +SEXP rj_mkCharUTF8(const char *src) { return rj_mkCharUTF8_(src, 0); } +SEXP rj_mkCharUTF8_noerr(const char *src) { return rj_mkCharUTF8_(src, 1); } + +jstring rj_newJavaString(JNIEnv *env, SEXP sChar) { + jchar *s; + int len = rj_rchar_utf16(sChar, &s); + return (*env)->NewString(env, s, (len + 1) >> 1); +} + +jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len) { + jchar *s; + int rlen = rj_char_utf16(str, len, &s, "", 0); + return (rlen < 0) ? 0 : (*env)->NewString(env, s, (rlen + 1) >> 1); +} diff --git a/src/rjstring.h b/src/rjstring.h new file mode 100644 index 0000000..6fd24a1 --- /dev/null +++ b/src/rjstring.h @@ -0,0 +1,25 @@ +#ifndef RJ_STRING_H__ +#define RJ_STRING_H__ + +#include /* for jchar */ +#include /* for SEXP */ + +/* --- API --- */ + +/* Returns static content for short strings so don't re-use. 
+ For dynamic strings uses R_alloc */ +int rj_char_utf16(const char *c, int len, jchar **buf, const char *ifrom, int can_error); + +/* wrappers for above to use with CHARSXP to detect proper ifrom */ +int rj_rchar_utf16(SEXP s, jchar **buf); +int rj_rchar_utf16_noerr(SEXP s, jchar **buf); + +/* return jstring, but do NOT check exceptions */ +jstring rj_newJavaString(JNIEnv *env, SEXP sChar); +jstring rj_newNativeJavaString(JNIEnv *env, const char *str, int len); + +/* takes modified UTF-8 from Java, creates CHARSXP with valid UTF8 */ +SEXP rj_mkCharUTF8(const char *src); +SEXP rj_mkCharUTF8_noerr(const char *src); + +#endif