HTML5_InputStream

The WooCommerce PDF Invoices & Packing Slips HTML5 InputStream class.

Defined (1)

The class is defined in the following location(s).

/lib/dompdf/lib/html5lib/InputStream.php  
  class HTML5_InputStream {
      /**
       * The string data we're parsing (after the pre-processing done in the
       * constructor: BOM stripped, NULs replaced, newlines normalised to LF).
       */
      private $data;

      /**
       * The current integer byte position we are in $data.
       */
      private $char;

      /**
       * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
       */
      private $EOF;

      /**
       * Parse errors collected while pre-processing the input. Each entry is
       * array('type' => HTML5_Tokenizer::PARSEERROR, 'data' => <error name>).
       */
      public $errors = array();

      /**
       * Pre-processes the raw input and positions the stream at byte 0.
       *
       * @param $data Data to parse (assumed to be UTF-8).
       * @throws Exception When the iconv extension is not loaded.
       */
      public function __construct($data) {

          /* Given an encoding, the bytes in the input stream must be
          converted to Unicode characters for the tokeniser, as
          described by the rules for that encoding, except that the
          leading U+FEFF BYTE ORDER MARK character, if any, must not
          be stripped by the encoding layer (it is stripped by the rule below).

          Bytes or sequences of bytes in the original byte stream that
          could not be converted to Unicode characters must be converted
          to U+FFFD REPLACEMENT CHARACTER code points. */

          // XXX currently assuming input data is UTF-8; once we
          // build encoding detection this will no longer be the case
          //
          // We previously had an mbstring implementation here, but that
          // implementation is heavily non-conforming, so it's been
          // omitted.
          if (extension_loaded('iconv')) {
              // non-conforming: invalid sequences are dropped instead of
              // being replaced by U+FFFD.
              $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
              if ($data === false) {
                  // iconv gave up on the input. The rest of this method
                  // already treated that result as an empty string through
                  // implicit conversion; make that explicit.
                  $data = '';
              }
          } else {
              // we can make a conforming native implementation
              throw new Exception('Not implemented, please install mbstring or iconv');
          }

          /* One leading U+FEFF BYTE ORDER MARK character must be
          ignored if any are present. */
          if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
              $data = substr($data, 3);
          }

          /* All U+0000 NULL characters in the input must be replaced
          by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
          characters is a parse error. */
          for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
              $this->errors[] = array(
                  'type' => HTML5_Tokenizer::PARSEERROR,
                  'data' => 'null-character'
              );
          }

          /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
          (LF) characters are treated specially. Any CR characters
          that are followed by LF characters must be removed, and any
          CR characters not followed by LF characters must be converted
          to LF characters. Thus, newlines in HTML DOMs are represented
          by LF characters, and there are never any CR characters in the
          input to the tokenization stage. */
          $data = str_replace(
              array(
                  "\0",
                  "\r\n",
                  "\r"
              ),
              array(
                  "\xEF\xBF\xBD",
                  "\n",
                  "\n"
              ),
              $data
          );

          /* Any occurrences of any characters in the ranges U+0001 to
          U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
          U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
          characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
          U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
          U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
          U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
          U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
          U+10FFFF are parse errors. (These are all control characters
          or permanently undefined Unicode characters.) */
          // Check PCRE is loaded.
          if (extension_loaded('pcre')) {
              // The branches below are alternatives and must be separated
              // by "|"; without the separators the pattern only matches all
              // six byte sequences back to back, i.e. practically never.
              $count = preg_match_all(
                  '/(?:
                      [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
                  |
                      \xC2[\x80-\x9F] # U+0080 to U+009F
                  |
                      \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
                  |
                      \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                  |
                      \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                  |
                      [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                  )/x',
                  $data,
                  $matches
              );
              for ($i = 0; $i < $count; $i++) {
                  $this->errors[] = array(
                      'type' => HTML5_Tokenizer::PARSEERROR,
                      'data' => 'invalid-codepoint'
                  );
              }
          } else {
              // XXX: Need non-PCRE impl, probably using substr_count
          }

          $this->data = $data;
          $this->char = 0;
          $this->EOF = strlen($data);
      }

      /**
       * Returns the current line that the tokenizer is at.
       *
       * @return int 1-based line number: one plus the number of LF bytes
       *             located before the current byte position.
       */
      public function getCurrentLine() {
          // $this->char may have run past the end of the data (char()
          // increments unconditionally), so never count beyond EOF.
          $consumed = min($this->char, $this->EOF);

          // Nothing consumed yet (or empty input): we are on the first
          // line (sorta). Also avoids handing a zero length to substr_count.
          if ($consumed <= 0) {
              return 1;
          }

          return substr_count($this->data, "\n", 0, $consumed) + 1;
      }

      /**
       * Returns the current column of the current line that the tokenizer is at.
       *
       * @return int|false Number of UTF-8 characters between the last LF
       *                   before the current position and the current
       *                   position; 0 when nothing has been consumed.
       *                   iconv_strlen()/mb_strlen() failures are passed
       *                   through unchanged.
       */
      public function getColumnOffset() {
          // Clamp to EOF: char() keeps incrementing past the end.
          $position = min($this->char, $this->EOF);

          // Short circuit for the very first byte. The negative offset
          // computed below would be one past the start of the string,
          // which strrpos rejects (a warning on PHP 7, a ValueError on
          // PHP 8).
          if ($position <= 0) {
              return 0;
          }

          // strrpos is weird, and the offset needs to be negative for what we
          // want (i.e., the last \n before $position). This needs to not have
          // one (to make it point to the next character, the one we want the
          // position of) added to it because strrpos's behaviour includes the
          // final offset byte.
          $lastLine = strrpos($this->data, "\n", $position - 1 - $this->EOF);

          // However, for here we want the length up until the next byte to be
          // processed, so take everything up to (not including) $position.
          if ($lastLine !== false) {
              $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
          } else {
              $findLengthOf = substr($this->data, 0, $position);
          }

          // Get the length for the string we need.
          if (extension_loaded('iconv')) {
              return iconv_strlen($findLengthOf, 'utf-8');
          } elseif (extension_loaded('mbstring')) {
              return mb_strlen($findLengthOf, 'utf-8');
          } else {
              // Extension-free fallback: in valid UTF-8 every character
              // contributes exactly one ASCII byte or one lead byte.
              $count = count_chars($findLengthOf);
              // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
              // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
              return array_sum(array_slice($count, 0, 0x80)) +
                     array_sum(array_slice($count, 0xC2, 0x33));
          }
      }

      /**
       * Retrieve the currently consumed character (a single byte) and
       * advance the position by one.
       * @note This performs bounds checking
       *
       * @return string|false The byte at the current position, or false at
       *                      end-of-file. The position is advanced either way.
       */
      public function char() {
          return ($this->char++ < $this->EOF)
              ? $this->data[$this->char - 1]
              : false;
      }

      /**
       * Get all characters until EOF and move the position to EOF.
       * @note This performs bounds checking
       *
       * @return string|false The unread remainder, or false when already at
       *                      (or past) end-of-file.
       */
      public function remainingChars() {
          if ($this->char < $this->EOF) {
              $data = substr($this->data, $this->char);
              $this->char = $this->EOF;
              return $data;
          } else {
              return false;
          }
      }

      /**
       * Matches as far as possible until we reach a certain set of bytes
       * and returns the matched substring.
       *
       * @param $bytes Bytes to match.
       * @param $max   Optional maximum number of bytes to examine.
       * @return string|false The consumed bytes (possibly ''), or false at
       *                      end-of-file.
       */
      public function charsUntil($bytes, $max = null) {
          if ($this->char < $this->EOF) {
              if ($max === 0 || $max) {
                  $len = strcspn($this->data, $bytes, $this->char, $max);
              } else {
                  $len = strcspn($this->data, $bytes, $this->char);
              }
              $string = (string) substr($this->data, $this->char, $len);
              $this->char += $len;
              return $string;
          } else {
              return false;
          }
      }

      /**
       * Matches as far as possible with a certain set of bytes
       * and returns the matched substring.
       *
       * @param $bytes Bytes to match.
       * @param $max   Optional maximum number of bytes to examine.
       * @return string|false The consumed bytes (possibly ''), or false at
       *                      end-of-file.
       */
      public function charsWhile($bytes, $max = null) {
          if ($this->char < $this->EOF) {
              if ($max === 0 || $max) {
                  $len = strspn($this->data, $bytes, $this->char, $max);
              } else {
                  $len = strspn($this->data, $bytes, $this->char);
              }
              $string = (string) substr($this->data, $this->char, $len);
              $this->char += $len;
              return $string;
          } else {
              return false;
          }
      }

      /**
       * Unconsume one character.
       *
       * Does nothing when the position is past EOF (as before) or already
       * at the start of the data, so the position can never become negative.
       */
      public function unget() {
          if ($this->char > 0 && $this->char <= $this->EOF) {
              $this->char--;
          }
      }
  }