/** * @license * Copyright 2019 The Emscripten Authors * SPDX-License-Identifier: MIT */ // runtime_strings.js: Strings related runtime functions that are part of both MINIMAL_RUNTIME and regular runtime. // Given a pointer 'ptr' to a null-terminated UTF8-encoded string in the given array that contains uint8 values, returns // a copy of that string as a Javascript String object. #if USE_PTHREADS && TEXTDECODER // UTF8Decoder.decode may not work with a view of a SharedArrayBuffer, see // https://github.com/whatwg/encoding/issues/172 // To avoid that, we wrap around it and add a copy into a normal ArrayBuffer, // which can still be much faster than creating a string character by // character. function TextDecoderWrapper(encoding) { var textDecoder = new TextDecoder(encoding); this.decode = function(data) { #if ASSERTIONS assert(data instanceof Uint8Array); #endif // While we compile with pthreads, this method can be called on side buffers // as well, such as the stdout buffer in the filesystem code. Only copy when // we have to. if (data.buffer instanceof SharedArrayBuffer) { data = new Uint8Array(data); } return textDecoder.decode.call(textDecoder, data); }; } #endif #if TEXTDECODER == 2 var UTF8Decoder = new TextDecoder{{{ USE_PTHREADS ? 'Wrapper' : ''}}}('utf8'); #else // TEXTDECODER == 2 #if TEXTDECODER var UTF8Decoder = typeof TextDecoder !== 'undefined' ? new TextDecoder{{{ USE_PTHREADS ? 'Wrapper' : ''}}}('utf8') : undefined; #endif // TEXTDECODER #endif // TEXTDECODER == 2 /** * @param {number} idx * @param {number=} maxBytesToRead * @return {string} */ function UTF8ArrayToString(heap, idx, maxBytesToRead) { #if CAN_ADDRESS_2GB idx >>>= 0; #endif var endIdx = idx + maxBytesToRead; #if TEXTDECODER var endPtr = idx; // TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself. // Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage. // (As a tiny code save trick, compare endPtr against endIdx using a negation, so that undefined means Infinity) while (heap[endPtr] && !(endPtr >= endIdx)) ++endPtr; #endif // TEXTDECODER #if TEXTDECODER == 2 return UTF8Decoder.decode( heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr)) ); #else // TEXTDECODER == 2 #if TEXTDECODER if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) { return UTF8Decoder.decode(heap.subarray(idx, endPtr)); } else { #endif // TEXTDECODER var str = ''; #if TEXTDECODER // If building with TextDecoder, we have already computed the string length above, so test loop end condition against that while (idx < endPtr) { #else while (!(idx >= endIdx)) { #endif // For UTF8 byte structure, see: // http://en.wikipedia.org/wiki/UTF-8#Description // https://www.ietf.org/rfc/rfc2279.txt // https://tools.ietf.org/html/rfc3629 var u0 = heap[idx++]; #if !TEXTDECODER // If not building with TextDecoder enabled, we don't know the string length, so scan for \0 byte. // If building with TextDecoder, we know exactly at what byte index the string ends, so checking for nulls here would be redundant. if (!u0) return str; #endif if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; } var u1 = heap[idx++] & 63; if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; } var u2 = heap[idx++] & 63; if ((u0 & 0xF0) == 0xE0) { u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; } else { #if ASSERTIONS if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string in wasm memory to a JS string!'); #endif u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63); } if (u0 < 0x10000) { str += String.fromCharCode(u0); } else { var ch = u0 - 0x10000; str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF)); } } #if TEXTDECODER } #endif // TEXTDECODER return str; #endif // TEXTDECODER == 2 } // Given a pointer 'ptr' to a null-terminated UTF8-encoded string in the emscripten HEAP, returns a // copy of that string as a Javascript String object. // maxBytesToRead: an optional length that specifies the maximum number of bytes to read. You can omit // this parameter to scan the string until the first \0 byte. If maxBytesToRead is // passed, and the string at [ptr, ptr+maxBytesToReadr[ contains a null byte in the // middle, then the string will cut short at that byte index (i.e. maxBytesToRead will // not produce a string of exact length [ptr, ptr+maxBytesToRead[) // N.B. mixing frequent uses of UTF8ToString() with and without maxBytesToRead may // throw JS JIT optimizations off, so it is worth to consider consistently using one // style or the other. /** * @param {number} ptr * @param {number=} maxBytesToRead * @return {string} */ function UTF8ToString(ptr, maxBytesToRead) { #if CAN_ADDRESS_2GB ptr >>>= 0; #endif #if TEXTDECODER == 2 if (!ptr) return ''; var maxPtr = ptr + maxBytesToRead; for (var end = ptr; !(end >= maxPtr) && HEAPU8[end];) ++end; return UTF8Decoder.decode(HEAPU8.subarray(ptr, end)); #else return ptr ? UTF8ArrayToString(HEAPU8, ptr, maxBytesToRead) : ''; #endif } // Copies the given Javascript String object 'str' to the given byte array at address 'outIdx', // encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP. // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write. // Parameters: // str: the Javascript string to copy. // heap: the array to copy to. Each index in this array is assumed to be one 8-byte element. // outIdx: The starting offset in the array to begin the copying. // maxBytesToWrite: The maximum number of bytes this function can write to the array. // This count should include the null terminator, // i.e. if maxBytesToWrite=1, only the null terminator will be written and nothing else. // maxBytesToWrite=0 does not write any bytes to the output, not even the null terminator. // Returns the number of bytes written, EXCLUDING the null terminator. function stringToUTF8Array(str, heap, outIdx, maxBytesToWrite) { #if CAN_ADDRESS_2GB outIdx >>>= 0; #endif if (!(maxBytesToWrite > 0)) // Parameter maxBytesToWrite is not optional. Negative values, 0, null, undefined and false each don't write out any bytes. return 0; var startIdx = outIdx; var endIdx = outIdx + maxBytesToWrite - 1; // -1 for string null terminator. for (var i = 0; i < str.length; ++i) { // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! So decode UTF16->UTF32->UTF8. // See http://unicode.org/faq/utf_bom.html#utf16-3 // For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description and https://www.ietf.org/rfc/rfc2279.txt and https://tools.ietf.org/html/rfc3629 var u = str.charCodeAt(i); // possibly a lead surrogate if (u >= 0xD800 && u <= 0xDFFF) { var u1 = str.charCodeAt(++i); u = 0x10000 + ((u & 0x3FF) << 10) | (u1 & 0x3FF); } if (u <= 0x7F) { if (outIdx >= endIdx) break; heap[outIdx++] = u; } else if (u <= 0x7FF) { if (outIdx + 1 >= endIdx) break; heap[outIdx++] = 0xC0 | (u >> 6); heap[outIdx++] = 0x80 | (u & 63); } else if (u <= 0xFFFF) { if (outIdx + 2 >= endIdx) break; heap[outIdx++] = 0xE0 | (u >> 12); heap[outIdx++] = 0x80 | ((u >> 6) & 63); heap[outIdx++] = 0x80 | (u & 63); } else { if (outIdx + 3 >= endIdx) break; #if ASSERTIONS if (u >= 0x200000) warnOnce('Invalid Unicode code point 0x' + u.toString(16) + ' encountered when serializing a JS string to a UTF-8 string in wasm memory! (Valid unicode code points should be in range 0-0x1FFFFF).'); #endif heap[outIdx++] = 0xF0 | (u >> 18); heap[outIdx++] = 0x80 | ((u >> 12) & 63); heap[outIdx++] = 0x80 | ((u >> 6) & 63); heap[outIdx++] = 0x80 | (u & 63); } } // Null-terminate the pointer to the buffer. heap[outIdx] = 0; return outIdx - startIdx; } // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr', // null-terminated and encoded in UTF8 form. The copy will require at most str.length*4+1 bytes of space in the HEAP. // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write. // Returns the number of bytes written, EXCLUDING the null terminator. function stringToUTF8(str, outPtr, maxBytesToWrite) { #if ASSERTIONS assert(typeof maxBytesToWrite == 'number', 'stringToUTF8(str, outPtr, maxBytesToWrite) is missing the third parameter that specifies the length of the output buffer!'); #endif return stringToUTF8Array(str, {{{ heapAndOffset('HEAPU8', 'outPtr') }}}, maxBytesToWrite); } // Returns the number of bytes the given Javascript string takes if encoded as a UTF8 byte array, EXCLUDING the null terminator byte. function lengthBytesUTF8(str) { var len = 0; for (var i = 0; i < str.length; ++i) { // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! So decode UTF16->UTF32->UTF8. // See http://unicode.org/faq/utf_bom.html#utf16-3 var u = str.charCodeAt(i); // possibly a lead surrogate if (u >= 0xD800 && u <= 0xDFFF) u = 0x10000 + ((u & 0x3FF) << 10) | (str.charCodeAt(++i) & 0x3FF); if (u <= 0x7F) ++len; else if (u <= 0x7FF) len += 2; else if (u <= 0xFFFF) len += 3; else len += 4; } return len; }