simple_html_dom

Simple HTML DOM parser. PaperG: in the find routine, callers can specify that the value of the selector should be tested case-insensitively.

Defined (1)

The class is defined in the following location(s).

/products/photocrati_nextgen/modules/simplehtmldom/simplehtmldom/simple_html_dom.php  
  1. class simple_html_dom 
  // Root node of the parsed tree; (re)created by prepare(), released by clear().
  public $root = null;
  // Nodes walked by clear(). Presumably appended to while nodes are constructed - TODO confirm in simple_html_dom_node.
  public $nodes = array();
  // Callback name stored by set_callback() and reset by remove_callback().
  public $callback = null;
  // When true, read_tag() lowercases tag and attribute names.
  public $lowercase = false;
  // Length of the input exactly as it was handed to prepare().
  public $original_size;
  // Length of the working document; refreshed by prepare() and remove_noise().
  public $size;
  // Scanner offset inside $doc.
  protected $pos;
  // Working copy of the markup being parsed.
  protected $doc;
  // Character at $pos, or null once the scanner has run past the end.
  protected $char;
  // Counter stamped into the HDOM_INFO_BEGIN / HDOM_INFO_END slots of nodes.
  protected $cursor;
  // Node that freshly linked nodes are attached to.
  protected $parent;
  // Placeholder key => original text, as stored by remove_noise().
  protected $noise = array();
  // Delimiter sets used by the scanner helpers.
  protected $token_blank = " \t\r\n";
  protected $token_equal = ' =/>';
  protected $token_slash = " />\r\n\t";
  protected $token_attr = ' >';
  // Public because a child node reads these values.
  public $_charset = '';
  public $_target_charset = '';
  // Inner text given to <br> elements by read_tag().
  protected $default_br_text = '';
  public $default_span_text = '';

  // Lookup tables are keyed by tag name so membership is tested with isset()
  // instead of in_array() (noted upstream as roughly 30% faster).
  protected $self_closing_tags = array(
      'img'    => 1,
      'br'     => 1,
      'input'  => 1,
      'meta'   => 1,
      'link'   => 1,
      'hr'     => 1,
      'base'   => 1,
      'embed'  => 1,
      'spacer' => 1,
  );
  protected $block_tags = array(
      'root'  => 1,
      'body'  => 1,
      'form'  => 1,
      'div'   => 1,
      'span'  => 1,
      'table' => 1,
  );
  // Opening tag => set of parent tags that it closes implicitly.
  // Known sourceforge issue #2977341: B tags that are not closed cause us to
  // return everything to the end of the document.
  protected $optional_closing_tags = array(
      'tr'     => array('tr' => 1, 'td' => 1, 'th' => 1),
      'th'     => array('th' => 1),
      'td'     => array('td' => 1),
      'li'     => array('li' => 1),
      'dt'     => array('dt' => 1, 'dd' => 1),
      'dd'     => array('dd' => 1, 'dt' => 1),
      'dl'     => array('dd' => 1, 'dt' => 1),
      'p'      => array('p' => 1),
      'nobr'   => array('nobr' => 1),
      'b'      => array('b' => 1),
      'option' => array('option' => 1),
  );
  43.  
  // Create a parser and, when $str is non-empty, parse it immediately.
  //
  // $str             markup, or an http:// URL / existing file path to load
  // $lowercase       forwarded to load() for inline markup
  // $forceTagsClosed false disables the implicit-closing table; use it when the
  //                  markup is trusted, since forced closing can mis-parse it
  // $target_charset  stored in $_target_charset
  // $stripRN, $defaultBRText, $defaultSpanText  forwarded to load() for inline markup
  function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  {
      // Configure the parser BEFORE any parsing happens. The table consulted by
      // read_tag() is $optional_closing_tags; the previous code assigned to a
      // non-existent "optional_closing_array" property after the document had
      // already been parsed, so the flag never had any effect.
      if (!$forceTagsClosed) {
          $this->optional_closing_tags = array();
      }
      $this->_target_charset = $target_charset;

      if ($str) {
          if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
              $this->load_file($str);
          } else {
              $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
          }
      }
  }
  54.  
  // Release the node graph when the parser object itself is destroyed.
  function __destruct()
  {
      $this->clear();
  }
  57.  
  58. // load html from string 
  // Parse markup held in a string.
  //
  // The input is prepared, "noise" (CDATA, comments, scripts, styles, code
  // blocks, server-side and smarty tags) is swapped for placeholders, the
  // document is tokenised and finally the charset is detected.
  // Returns $this so calls can be chained.
  function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  {
      global $debug_object;

      $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

      // Each pass: array(pattern, replace the whole match instead of group 1).
      // Order matters: script removal precedes style removal (sourceforge
      // tracker item 2949097).
      $noisePasses = array(
          // cdata
          array("'<!\[CDATA\[(.*?)\]\]>'is", true),
          // comments
          array("'<!--(.*?)-->'is", false),
          // <script> tags
          array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
          array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
          // <style> tags
          array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
          array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
          // preformatted tags
          array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
          // server side scripts
          array("'(<\?)(.*?)(\?>)'s", true),
          // smarty scripts
          array("'(\{\w)(.*?)(\})'s", true),
      );
      foreach ($noisePasses as $pass) {
          $this->remove_noise($pass[0], $pass[1]);
      }

      // Tokenise: parse() returns false once the document is exhausted.
      while ($this->parse()) {
          // nothing to do per token
      }

      $this->root->_[HDOM_INFO_END] = $this->cursor;
      $this->parse_charset();

      return $this;
  }
  91.  
  92.  
  93. // load html from file 
  // Read a file or URL and parse its contents.
  //
  // All arguments are passed straight to file_get_contents().
  // Returns false (after clearing the parser) when the source cannot be read;
  // returns nothing on success.
  //
  // The read result is tested directly. The previous code parsed whatever
  // file_get_contents() returned (including false) and then consulted
  // error_get_last(), which also reports errors raised long before this call,
  // so a perfectly good document could be discarded.
  function load_file()
  {
      $args = func_get_args();
      $contents = call_user_func_array('file_get_contents', $args);

      if ($contents === false) {
          $this->clear();
          return false;
      }

      $this->load($contents, true);
  }
  101.  
  102. // set callback function 
  // Remember the callback to use; the value is stored as given in $callback.
  function set_callback($function_name)
  {
      $this->callback = $function_name;
  }
  105.  
  106. // remove callback function 
  // Forget any callback previously stored with set_callback().
  function remove_callback()
  {
      $this->callback = null;
  }
  109.  
  110. // save dom as string 
  // Serialise the document (the root node's innertext) and return it.
  // When $filepath is not the empty string the text is also written there
  // under an exclusive lock.
  function save($filepath='')
  {
      $html = $this->root->innertext();

      if ($filepath === '') {
          return $html;
      }

      file_put_contents($filepath, $html, LOCK_EX);
      return $html;
  }
  115.  
  116. // find dom node by css selector 
  117. // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. 
  // Run a CSS selector query from the root node.
  // $idx and $lowercase are forwarded untouched; PaperG added $lowercase to
  // request case-insensitive testing of the selector value.
  function find($selector, $idx=null, $lowercase=false)
  {
      $rootNode = $this->root;
      return $rootNode->find($selector, $idx, $lowercase);
  }
  120.  
  121. // clean up memory due to php5 circular references memory leak... 
  // Break the references between parser and nodes so memory can be reclaimed
  // (php5 leaks circular references), then drop the document and noise table.
  function clear()
  {
      foreach ($this->nodes as $node) {
          $node->clear();
          $node = null;
      }

      // Added per sourceforge item 2977248 as a fix for leaks that persisted
      // even when clear() was used.
      if (isset($this->children)) {
          foreach ($this->children as $child) {
              $child->clear();
              $child = null;
          }
      }

      if (isset($this->parent)) {
          $this->parent->clear();
          unset($this->parent);
      }

      if (isset($this->root)) {
          $this->root->clear();
          unset($this->root);
      }

      unset($this->doc, $this->noise);
  }
  130.  
  // Debug helper: delegate to the root node's dump(), passing $show_attr through.
  function dump($show_attr=true)
  {
      $rootNode = $this->root;
      $rootNode->dump($show_attr);
  }
  133.  
  134. // prepare HTML data and init everything 
  // Reset the parser and install $str as the document to tokenise.
  //
  // $stripRN replaces every \r and \n with a space before parsing.
  // The remaining arguments are stored in the matching properties.
  protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  {
      $this->clear();

      // Size of the html as we got it; it might be useful to someone.
      $this->original_size = strlen($str);

      if ($stripRN) {
          $str = str_replace(array("\r", "\n"), " ", $str);
      }
      // Size of the text that will actually be parsed.
      $this->size = strlen($str);

      // scanner state
      $this->doc = $str;
      $this->pos = 0;
      $this->cursor = 1;
      $this->noise = array();
      $this->nodes = array();

      // options
      $this->lowercase = $lowercase;
      $this->default_br_text = $defaultBRText;
      $this->default_span_text = $defaultSpanText;

      // synthetic root node that everything else hangs off
      $this->root = new simple_html_dom_node($this);
      $rootNode = $this->root;
      $rootNode->tag = 'root';
      $rootNode->_[HDOM_INFO_BEGIN] = -1;
      $rootNode->nodetype = HDOM_TYPE_ROOT;
      $this->parent = $rootNode;

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  165.  
  166. // parse html content 
  // Consume one token: either the text run up to the next '<', which becomes a
  // text node, or - when there is no such run - whatever read_tag() makes of
  // the current position. Returns false when read_tag() reports the end.
  protected function parse()
  {
      $text = $this->copy_until_char('<');

      if ($text === '') {
          return $this->read_tag();
      }

      $textNode = new simple_html_dom_node($this);
      ++$this->cursor;
      $textNode->_[HDOM_INFO_TEXT] = $text;
      $this->link_nodes($textNode, false);

      return true;
  }
  177.  
  178. // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. 
  179. // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec 
  180. // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. 
  // Work out the charset of the document just parsed, store it in $_charset
  // and return it. Sources, in order of precedence:
  //   1. the content-type reported by get_last_retrieve_url_contents_content_type(),
  //      if the host application defines that function;
  //   2. a <meta http-equiv=Content-Type> element (ISO-8859-1 when it names no charset);
  //   3. mb_detect_encoding() over the plain text, trying UTF-8 then CP1252;
  //   4. UTF-8 as a last resort.
  // ISO-8859-1 / Latin1 / Latin-1 are finally promoted to their superset CP1252.
  protected function parse_charset()
  {
      global $debug_object;

      $haveDebug = is_object($debug_object);
      $charset = null;

      // 1. transport header
      if (function_exists('get_last_retrieve_url_contents_content_type')) {
          $contentTypeHeader = get_last_retrieve_url_contents_content_type();
          if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches)) {
              $charset = $matches[1];
              if ($haveDebug) {
                  $debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);
              }
          }
      }

      // 2. meta tag
      if (empty($charset)) {
          $el = $this->root->find('meta[http-equiv=Content-Type]', 0);
          if (!empty($el)) {
              $fullvalue = $el->content;
              if ($haveDebug) {
                  $debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);
              }

              if (!empty($fullvalue)) {
                  if (preg_match('/charset=(.+)/', $fullvalue, $matches)) {
                      $charset = $matches[1];
                  } else {
                      // A meta tag without a charset typically means ISO-8859-1.
                      if ($haveDebug) {
                          $debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                      }
                      $charset = 'ISO-8859-1';
                  }
              }
          }
      }

      // 3. + 4. detection from the text itself
      if (empty($charset)) {
          // Stays false when the mbstring detector is not available.
          $charset = false;
          if (function_exists('mb_detect_encoding')) {
              $charset = mb_detect_encoding($this->root->plaintext . "ascii", array("UTF-8", "CP1252"));
              if ($haveDebug) {
                  $debug_object->debug_log(2, 'mb_detect found: ' . $charset);
              }
          }

          // Wrongheaded but workable assumption so that we can move on.
          if ($charset === false) {
              if ($haveDebug) {
                  $debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');
              }
              $charset = 'UTF-8';
          }
      }

      // CP1252 is a superset of these, so prefer it.
      $subsets = array('iso-8859-1', 'latin1', 'latin-1');
      if (in_array(strtolower($charset), $subsets, true)) {
          if ($haveDebug) {
              $debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');
          }
          $charset = 'CP1252';
      }

      if ($haveDebug) {
          $debug_object->debug_log(1, 'EXIT - ' . $charset);
      }

      $this->_charset = $charset;
      return $charset;
  }
  231.  
  232. // read tag info 
  // Read the construct that starts at the current '<' and link the resulting
  // node into the tree.
  //
  // Handles, in order: closing tags (including recovery from mismatched ones),
  // doctype/cdata/comment declarations, stray '<' that turn out to be text,
  // and opening tags with their attributes.
  //
  // Returns false when the current character is not '<' (nothing left to
  // read; the root node is closed), true otherwise.
  //
  // Fixes relative to the previous revision:
  //  - the tag-name pattern used the character class [\w-:], which PCRE2
  //    (PHP >= 7.3) rejects as an invalid range; preg_match() then returned
  //    false and every tag was treated as text. The hyphen is now escaped.
  //  - "$pos=strpos(...)!==false" assigned the boolean because of operator
  //    precedence; the unused assignment is gone.
  protected function read_tag()
  {
      if ($this->char!=='<') {
          $this->root->_[HDOM_INFO_END] = $this->cursor;
          return false;
      }
      $begin_tag_pos = $this->pos;
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // end tag
      if ($this->char==='/') {
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          // Blank skipping here follows simple_html_dom trunk revision 181.
          $this->skip($this->token_blank);
          $tag = $this->copy_until_char('>');

          // skip attributes in end tag
          if (($pos = strpos($tag, ' '))!==false) {
              $tag = substr($tag, 0, $pos);
          }

          $parent_lower = strtolower($this->parent->tag);
          $tag_lower = strtolower($tag);

          // The closing tag does not match the element currently open: try
          // to find the ancestor it belongs to, otherwise keep it as text.
          if ($parent_lower!==$tag_lower) {
              if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $this->parent->parent;
                  }

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      if ($this->parent->parent) $this->parent = $this->parent->parent;
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $this->parent->parent;
                  }

                  if (strtolower($this->parent->tag)!==$tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              }
              else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $this->parent = $this->parent->parent;
              }
              else {
                  return $this->as_text_node($tag);
              }
          }

          // Close the matched element and step back up to its parent.
          $this->parent->_[HDOM_INFO_END] = $this->cursor;
          if ($this->parent->parent) $this->parent = $this->parent->parent;

          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      $node = new simple_html_dom_node($this);
      $node->_[HDOM_INFO_BEGIN] = $this->cursor;
      ++$this->cursor;
      $tag = $this->copy_until($this->token_slash);
      $node->tag_start = $begin_tag_pos;

      // doctype, cdata & comments...
      if (isset($tag[0]) && $tag[0]==='!') {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

          if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
              $node->nodetype = HDOM_TYPE_COMMENT;
              $node->tag = 'comment';
          } else {
              $node->nodetype = HDOM_TYPE_UNKNOWN;
              $node->tag = 'unknown';
          }
          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, true);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // text: another '<' showed up before the tag name ended
      if (strpos($tag, '<')!==false) {
          $tag = '<' . substr($tag, 0, -1);
          $node->_[HDOM_INFO_TEXT] = $tag;
          $this->link_nodes($node, false);
          $this->char = $this->doc[--$this->pos]; // prev
          return true;
      }

      // Not a plausible tag name: keep the run as text.
      if (!preg_match('/^[\w\-:]+$/', $tag)) {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
          if ($this->char==='<') {
              $this->link_nodes($node, false);
              return true;
          }

          if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
          $this->link_nodes($node, false);
          $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // begin tag
      $node->nodetype = HDOM_TYPE_ELEMENT;
      $tag_lower = strtolower($tag);
      $node->tag = ($this->lowercase) ? $tag_lower : $tag;

      // handle optional closing tags: implicitly close every open ancestor
      // that this tag is listed as terminating
      if (isset($this->optional_closing_tags[$tag_lower])) {
          while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
              $this->parent->_[HDOM_INFO_END] = 0;
              $this->parent = $this->parent->parent;
          }
          $node->parent = $this->parent;
      }

      $guard = 0; // prevent infinity loop
      $space = array($this->copy_skip($this->token_blank), '', '');

      // attributes
      do {
          if ($this->char!==null && $space[0]==='') {
              break;
          }
          $name = $this->copy_until($this->token_equal);
          if ($guard===$this->pos) {
              // No progress since the last round: force the scanner forward.
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              continue;
          }
          $guard = $this->pos;

          // handle endless '<'
          if ($this->pos>=$this->size-1 && $this->char!=='>') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
              $node->tag = 'text';
              $this->link_nodes($node, false);
              return true;
          }

          // handle mismatch '<'
          if ($this->doc[$this->pos-1]=='<') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->tag = 'text';
              $node->attr = array();
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
              $this->pos -= 2;
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              $this->link_nodes($node, false);
              return true;
          }

          if ($name!=='/' && $name!=='') {
              $space[1] = $this->copy_skip($this->token_blank);
              $name = $this->restore_noise($name);
              if ($this->lowercase) $name = strtolower($name);
              if ($this->char==='=') {
                  $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                  $this->parse_attr($node, $name, $space);
              }
              else {
                  // no value attr: nowrap, checked selected...
                  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                  $node->attr[$name] = true;
                  if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
              }
              $node->_[HDOM_INFO_SPACE][] = $space;
              $space = array($this->copy_skip($this->token_blank), '', '');
          }
          else {
              break;
          }
      } while ($this->char!=='>' && $this->char!=='/');

      $this->link_nodes($node, true);
      $node->_[HDOM_INFO_ENDSPACE] = $space[0];

      // check self closing
      if ($this->copy_until_char_escape('>')==='/') {
          $node->_[HDOM_INFO_ENDSPACE] .= '/';
          $node->_[HDOM_INFO_END] = 0;
      }
      else {
          // reset parent: the new element becomes the current container
          // unless it is one of the known self-closing tags
          if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
      }
      $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

      // A BR tag never has sub nodes, so give it the default text; that way
      // plaintext output can carry the formatting the user wants.
      if ($node->tag == "br") {
          $node->_[HDOM_INFO_INNER] = $this->default_br_text;
      }

      return true;
  }
  414.  
  415. // parse attributes 
  // Read the value of attribute $name (the scanner sits just after '=') and
  // store it on $node.
  //
  // $node   node receiving the attribute
  // $name   attribute name, already normalised by the caller
  // $space  whitespace record of the caller; slot 2 receives the blanks found
  //         between '=' and the value
  //
  // Per sourceforge tracker item 3061408, when an attribute occurs more than
  // once only the FIRST occurrence counts. The previous code returned before
  // reading a duplicate's value, which left the scanner in front of that value
  // and made the caller abandon the remaining attributes of the tag. The value
  // of a duplicate is now consumed and thrown away.
  protected function parse_attr($node, $name, &$space)
  {
      $is_duplicate = isset($node->attr[$name]);

      $blank = $this->copy_skip($this->token_blank);
      if (!$is_duplicate) {
          $space[2] = $blank;
      }

      switch ($this->char) {
          case '"':
              $quote_type = HDOM_QUOTE_DOUBLE;
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              $value = $this->copy_until_char_escape('"');
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              break;
          case '\'':
              $quote_type = HDOM_QUOTE_SINGLE;
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              $value = $this->copy_until_char_escape('\'');
              $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
              break;
          default:
              $quote_type = HDOM_QUOTE_NO;
              $value = $this->copy_until($this->token_attr);
      }

      if ($is_duplicate) {
          // First occurrence wins; the scanner is now past the duplicate.
          return;
      }

      $value = $this->restore_noise($value);

      // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
      $value = str_replace("\r", "", $value);
      $value = str_replace("\n", "", $value);

      // PaperG: for "class", drop leading and trailing space since some
      // people leave it in the multi class case.
      if ($name == "class") {
          $value = trim($value);
      }

      $node->_[HDOM_INFO_QUOTE][] = $quote_type;
      $node->attr[$name] = $value;
  }
  445.  
  446. // link node's parent 
  // Attach $node to the node currently being filled: it always joins that
  // node's "nodes" list and, when $is_child is set, its "children" list too.
  protected function link_nodes(&$node, $is_child)
  {
      $node->parent = $this->parent;
      $this->parent->nodes[] = $node;

      if (!$is_child) {
          return;
      }
      $this->parent->children[] = $node;
  }
  452.  
  453. // as a text node 
  // Keep a closing tag that could not be matched as literal text, then move
  // the scanner one character forward. Always returns true.
  protected function as_text_node($tag)
  {
      $textNode = new simple_html_dom_node($this);
      ++$this->cursor;
      $textNode->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
      $this->link_nodes($textNode, false);

      ++$this->pos;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return true;
  }
  461.  
  // Advance past the run of characters from $chars that starts at the current
  // position and refresh the current character.
  protected function skip($chars)
  {
      $run = strspn($this->doc, $chars, $this->pos);
      $this->pos += $run;

      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }
  }
  465.  
  // Like skip(), but also return the characters that were skipped
  // ('' when there were none).
  protected function copy_skip($chars)
  {
      $from = $this->pos;
      $run = strspn($this->doc, $chars, $from);

      $this->pos += $run;
      $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

      return ($run === 0) ? '' : substr($this->doc, $from, $run);
  }
  473.  
  // Return the text from the current position up to (not including) the first
  // character found in $chars and leave the scanner on that character.
  protected function copy_until($chars)
  {
      $from = $this->pos;
      $run = strcspn($this->doc, $chars, $from);

      $this->pos += $run;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return substr($this->doc, $from, $run);
  }
  480.  
  // Return the text from the current position up to the next occurrence of
  // $char and park the scanner on it. When $char never occurs the rest of the
  // document is returned and the scanner is moved to the end.
  // Returns '' when already at the end or already on $char.
  protected function copy_until_char($char)
  {
      if ($this->char === null) {
          return '';
      }

      $hit = strpos($this->doc, $char, $this->pos);

      if ($hit === false) {
          $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
          $this->char = null;
          $this->pos = $this->size;
          return $rest;
      }

      if ($hit === $this->pos) {
          return '';
      }

      $from = $this->pos;
      $this->char = $this->doc[$hit];
      $this->pos = $hit;

      return substr($this->doc, $from, $hit - $from);
  }
  495.  
  // Same contract as copy_until_char(), except that an occurrence of $char
  // directly preceded by a backslash is skipped over.
  protected function copy_until_char_escape($char)
  {
      if ($this->char === null) {
          return '';
      }

      $searchFrom = $this->pos;
      while (($hit = strpos($this->doc, $char, $searchFrom)) !== false) {
          if ($hit === $this->pos) {
              return '';
          }

          if ($this->doc[$hit - 1] === '\\') {
              // escaped occurrence: keep looking behind it
              $searchFrom = $hit + 1;
              continue;
          }

          $from = $this->pos;
          $this->char = $this->doc[$hit];
          $this->pos = $hit;
          return substr($this->doc, $from, $hit - $from);
      }

      // no unescaped occurrence: hand back the remainder of the document
      $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
      $this->char = null;
      $this->pos = $this->size;
      return $rest;
  }
  517.  
  518. // remove noise from html content 
  519. // save the noise in the $this->noise array. 
  // Replace every match of $pattern in the document with a placeholder key and
  // remember the removed text in $this->noise under that key.
  //
  // $remove_tag  true: the whole match is replaced; false: only capture group 1.
  //
  // Matches are processed last to first so the offsets captured by
  // preg_match_all() stay valid while the document is being edited.
  protected function remove_noise($pattern, $remove_tag=false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
      $group = ($remove_tag) ? 0 : 1;

      $i = $count-1;
      while ($i > -1) {
          $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
          if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }

          $fragment = $matches[$i][$group][0];
          $offset = $matches[$i][$group][1];

          $this->noise[$key] = $fragment;
          $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));

          --$i;
      }

      // the document changed, so refresh its length and the current character
      $this->size = strlen($this->doc);
      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  537.  
  538. // restore noise to html content 
  // Put back the text that remove_noise() replaced with placeholders.
  //
  // Every "___noise___NNNNN" key found in $text is replaced with the stored
  // text. A key that is not in the noise table is replaced with
  // "UNDEFINED NOISE FOR KEY: <key>", and a marker too close to the end of
  // $text to carry a key is replaced with "NO NUMERIC NOISE KEY".
  // Returns the rewritten text.
  //
  // The undefined-key message contains the marker itself. The previous code
  // restarted every search at offset 0, found that same marker again and grew
  // the text forever. The search now resumes just behind the start of the
  // re-emitted marker, so each unknown key is reported exactly once.
  function restore_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $undefined_prefix = 'UNDEFINED NOISE FOR KEY: ';
      $resume = 0;

      while (($pos=strpos($text, '___noise___', $resume))!==false) {
          // Sometimes there is a broken piece of markup, and we don't GET the
          // pos+11 etc... token which indicates a problem outside of us...
          if (strlen($text) > $pos+15) {
              // marker (11 characters) plus the 5 character counter
              $key = substr($text, $pos, 16);
              if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }

              if (isset($this->noise[$key])) {
                  // Restored text may itself contain placeholders, so the
                  // next search still starts at $resume.
                  $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
              } else {
                  $text = substr($text, 0, $pos).$undefined_prefix.$key . substr($text, $pos+16);
                  // Step over the first character of the marker just written
                  // so it is not matched again.
                  $resume = $pos + strlen($undefined_prefix) + 1;
              }
          } else {
              // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
              $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
          }
      }

      return $text;
  }
  558.  
  559. // Sometimes we NEED one of the noise elements. 
  // Sometimes we NEED one of the noise elements: return the first stored noise
  // text that contains $text. Nothing is returned when none does.
  function search_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      foreach ($this->noise as $noiseElement) {
          if (strpos($noiseElement, $text) === false) {
              continue;
          }
          return $noiseElement;
      }
  }
  // String conversion yields the root node's innertext.
  function __toString()
  {
      $rootNode = $this->root;
      return $rootNode->innertext();
  }
  569.  
  // Virtual read-only properties:
  //   outertext, innertext  -> root node's innertext()
  //   plaintext             -> root node's text()
  //   charset               -> $_charset
  //   target_charset        -> $_target_charset
  // Any other name falls through and yields nothing.
  function __get($name)
  {
      switch ($name) {
          case 'outertext':
          case 'innertext':
              return $this->root->innertext();

          case 'plaintext':
              return $this->root->text();

          case 'charset':
              return $this->_charset;

          case 'target_charset':
              return $this->_target_charset;
      }
  }
  582.  
  583. // camel naming conventions 
  // Camel-case alias: forwards to the root node's childNodes() with $idx unchanged.
  function childNodes($idx=-1)
  {
      $rootNode = $this->root;
      return $rootNode->childNodes($idx);
  }
  // Camel-case alias for the root node's first_child().
  function firstChild()
  {
      $rootNode = $this->root;
      return $rootNode->first_child();
  }
  // Camel-case alias for the root node's last_child().
  function lastChild()
  {
      $rootNode = $this->root;
      return $rootNode->last_child();
  }
  // Build a detached element by parsing "<$name>$value</$name>" with
  // str_get_html() and returning the first child of that document.
  // Errors raised while doing so are suppressed, exactly as before.
  function createElement($name, $value=null)
  {
      return @str_get_html("<$name>$value</$name>")->first_child();
  }
  // Build a detached text node by parsing $value with str_get_html() and
  // returning the last entry of that document's node list.
  // Errors raised while doing so are suppressed, exactly as before.
  function createTextNode($value)
  {
      return @end(str_get_html($value)->nodes);
  }
  // Camel-case alias: first match of the selector "#$id".
  function getElementById($id)
  {
      $selector = "#$id";
      return $this->find($selector, 0);
  }
  // Camel-case alias: matches of the selector "#$id"; $idx is forwarded to find().
  function getElementsById($id, $idx=null)
  {
      $selector = "#$id";
      return $this->find($selector, $idx);
  }
  // Camel-case alias: first match when $name is used as the selector.
  function getElementByTagName($name)
  {
      $firstMatch = $this->find($name, 0);
      return $firstMatch;
  }
  // Camel-case alias: matches when $name is used as the selector; $idx is
  // forwarded to find().
  function getElementsByTagName($name, $idx=-1)
  {
      $matches = $this->find($name, $idx);
      return $matches;
  }
  // Camel-case alias for load_file(); accepts the same arguments.
  // Returns whatever load_file() returns (false when the source cannot be read).
  //
  // The arguments are spread onto load_file(). The previous code handed the
  // whole argument array over as ONE argument, so file_get_contents() was
  // given an array instead of a path.
  function loadFile()
  {
      $args = func_get_args();
      return call_user_func_array(array($this, 'load_file'), $args);
  }