Source: lib/util/string_utils.js

  1. /**
  2. * @license
  3. * Copyright 2016 Google Inc.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. goog.provide('shaka.util.StringUtils');
  18. goog.require('shaka.log');
  19. goog.require('shaka.util.Error');
  20. /**
  21. * @namespace shaka.util.StringUtils
  22. * @summary A set of string utility functions.
  23. * @exportDoc
  24. */
  25. /**
  26. * Creates a string from the given buffer as UTF-8 encoding.
  27. *
  28. * @param {?BufferSource} data
  29. * @return {string}
  30. * @throws {shaka.util.Error}
  31. * @export
  32. */
  33. shaka.util.StringUtils.fromUTF8 = function(data) {
  34. if (!data) return '';
  35. let uint8 = new Uint8Array(data);
  36. // If present, strip off the UTF-8 BOM.
  37. if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
  38. uint8 = uint8.subarray(3);
  39. }
  40. // http://stackoverflow.com/a/13691499
  41. let utf8 = shaka.util.StringUtils.fromCharCode(uint8);
  42. // This converts each character in the string to an escape sequence. If the
  43. // character is in the ASCII range, it is not converted; otherwise it is
  44. // converted to a URI escape sequence.
  45. // Example: '\x67\x35\xe3\x82\xac' -> 'g#%E3%82%AC'
  46. let escaped = escape(utf8);
  47. // Decode the escaped sequence. This will interpret UTF-8 sequences into the
  48. // correct character.
  49. // Example: 'g#%E3%82%AC' -> 'g#€'
  50. try {
  51. return decodeURIComponent(escaped);
  52. } catch (e) {
  53. throw new shaka.util.Error(
  54. shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
  55. shaka.util.Error.Code.BAD_ENCODING);
  56. }
  57. };
  58. /**
  59. * Creates a string from the given buffer as UTF-16 encoding.
  60. *
  61. * @param {?BufferSource} data
  62. * @param {boolean} littleEndian true to read little endian, false to read big.
  63. * @param {boolean=} noThrow true to avoid throwing in cases where we may
  64. * expect invalid input. If noThrow is true and the data has an odd length,
  65. * it will be truncated.
  66. * @return {string}
  67. * @throws {shaka.util.Error}
  68. * @export
  69. */
  70. shaka.util.StringUtils.fromUTF16 = function(data, littleEndian, noThrow) {
  71. if (!data) return '';
  72. if (!noThrow && data.byteLength % 2 != 0) {
  73. shaka.log.error('Data has an incorrect length, must be even.');
  74. throw new shaka.util.Error(
  75. shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
  76. shaka.util.Error.Code.BAD_ENCODING);
  77. }
  78. /** @type {ArrayBuffer} */
  79. let buffer;
  80. if (data instanceof ArrayBuffer) {
  81. buffer = data;
  82. } else {
  83. // Have to create a new buffer because the argument may be a smaller
  84. // view on a larger ArrayBuffer. We cannot use an ArrayBufferView in
  85. // a DataView.
  86. let temp = new Uint8Array(data.byteLength);
  87. temp.set(new Uint8Array(data));
  88. buffer = temp.buffer;
  89. }
  90. // Use a DataView to ensure correct endianness.
  91. let length = Math.floor(data.byteLength / 2);
  92. let arr = new Uint16Array(length);
  93. let dataView = new DataView(buffer);
  94. for (let i = 0; i < length; i++) {
  95. arr[i] = dataView.getUint16(i * 2, littleEndian);
  96. }
  97. return shaka.util.StringUtils.fromCharCode(arr);
  98. };
  99. /**
  100. * Creates a string from the given buffer, auto-detecting the encoding that is
  101. * being used. If it cannot detect the encoding, it will throw an exception.
  102. *
  103. * @param {?BufferSource} data
  104. * @return {string}
  105. * @throws {shaka.util.Error}
  106. * @export
  107. */
  108. shaka.util.StringUtils.fromBytesAutoDetect = function(data) {
  109. const StringUtils = shaka.util.StringUtils;
  110. let uint8 = new Uint8Array(data);
  111. if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
  112. return StringUtils.fromUTF8(uint8);
  113. } else if (uint8[0] == 0xfe && uint8[1] == 0xff) {
  114. return StringUtils.fromUTF16(uint8.subarray(2), false /* littleEndian */);
  115. } else if (uint8[0] == 0xff && uint8[1] == 0xfe) {
  116. return StringUtils.fromUTF16(uint8.subarray(2), true /* littleEndian */);
  117. }
  118. let isAscii = (function(arr, i) {
  119. // arr[i] >= ' ' && arr[i] <= '~';
  120. return arr.byteLength <= i || (arr[i] >= 0x20 && arr[i] <= 0x7e);
  121. }.bind(null, uint8));
  122. shaka.log.debug('Unable to find byte-order-mark, making an educated guess.');
  123. if (uint8[0] == 0 && uint8[2] == 0) {
  124. return StringUtils.fromUTF16(data, false /* littleEndian */);
  125. } else if (uint8[1] == 0 && uint8[3] == 0) {
  126. return StringUtils.fromUTF16(data, true /* littleEndian */);
  127. } else if (isAscii(0) && isAscii(1) && isAscii(2) && isAscii(3)) {
  128. return StringUtils.fromUTF8(data);
  129. }
  130. throw new shaka.util.Error(
  131. shaka.util.Error.Severity.CRITICAL,
  132. shaka.util.Error.Category.TEXT,
  133. shaka.util.Error.Code.UNABLE_TO_DETECT_ENCODING);
  134. };
  135. /**
  136. * Creates a ArrayBuffer from the given string, converting to UTF-8 encoding.
  137. *
  138. * @param {string} str
  139. * @return {!ArrayBuffer}
  140. * @export
  141. */
  142. shaka.util.StringUtils.toUTF8 = function(str) {
  143. // http://stackoverflow.com/a/13691499
  144. // Converts the given string to a URI encoded string. If a character falls
  145. // in the ASCII range, it is not converted; otherwise it will be converted to
  146. // a series of URI escape sequences according to UTF-8.
  147. // Example: 'g#€' -> 'g#%E3%82%AC'
  148. let encoded = encodeURIComponent(str);
  149. // Convert each escape sequence individually into a character. Each escape
  150. // sequence is interpreted as a code-point, so if an escape sequence happens
  151. // to be part of a multi-byte sequence, each byte will be converted to a
  152. // single character.
  153. // Example: 'g#%E3%82%AC' -> '\x67\x35\xe3\x82\xac'
  154. let utf8 = unescape(encoded);
  155. let result = new Uint8Array(utf8.length);
  156. for (let i = 0; i < utf8.length; ++i) {
  157. result[i] = utf8.charCodeAt(i);
  158. }
  159. return result.buffer;
  160. };
  161. /**
  162. * Creates a ArrayBuffer from the given string, converting to UTF-16 encoding.
  163. *
  164. * @param {string} str
  165. * @param {boolean} littleEndian
  166. * @return {!ArrayBuffer}
  167. * @export
  168. */
  169. shaka.util.StringUtils.toUTF16 = function(str, littleEndian) {
  170. const result = new Uint8Array(str.length * 2);
  171. const view = new DataView(result.buffer);
  172. for (let i = 0; i < str.length; ++i) {
  173. const value = str.charCodeAt(i);
  174. view.setUint16(/* position= */ i * 2, value, littleEndian);
  175. }
  176. return result.buffer;
  177. };
  178. /**
  179. * Creates a new string from the given array of char codes.
  180. *
  181. * Using String.fromCharCode.apply is risky because you can trigger stack errors
  182. * on very large arrays. This breaks up the array into several pieces to avoid
  183. * this.
  184. *
  185. * @param {!TypedArray} array
  186. * @return {string}
  187. */
  188. shaka.util.StringUtils.fromCharCode = function(array) {
  189. let max = 16000;
  190. let ret = '';
  191. for (let i = 0; i < array.length; i += max) {
  192. let subArray = array.subarray(i, i + max);
  193. ret += String.fromCharCode.apply(null, subArray);
  194. }
  195. return ret;
  196. };