simple_html_dom_node

Simple HTML DOM node. PaperG: added the ability for the "find" routine to lowercase the value of the selector.

Defined (1)

The class is defined in the following location(s).

/lib/simple-html-dom/simple_html_dom.php  
  /**
   * One node of a parsed HTML document.
   *
   * A node keeps its tag name, its attributes, links to its parent and
   * children, and an "info" array ($_) indexed by the HDOM_INFO_* constants.
   * Rendering (outertext / innertext / plaintext), CSS-like searching (find)
   * and attribute access through magic properties are all implemented here.
   *
   * PaperG - added ability for "find" routine to lowercase the value of the selector.
   */
  class simple_html_dom_node
  {
      /** Node type; one of the HDOM_TYPE_* constants. */
      public $nodetype = HDOM_TYPE_TEXT;
      /** Tag name ('text' by default, 'root' is treated specially by outertext()). */
      public $tag = 'text';
      /** Attribute name => value. true means "no value", null/false means removed (see makeup()). */
      public $attr = array();
      /** List of child nodes used for sibling/child navigation. */
      public $children = array();
      /** List of nodes rendered by innertext()/outertext()/text(). */
      public $nodes = array();
      /** Parent node, or null. */
      public $parent = null;
      // The "info" array - see HDOM_INFO_... for what each element contains.
      public $_ = array();
      public $tag_start = 0;
      /** Owning DOM object; provides nodes, restore_noise(), charsets, callback. */
      private $dom = null;

      /**
       * Attach the node to its DOM and register it in the DOM's node list.
       *
       * @param object $dom Owning DOM object.
       */
      public function __construct($dom)
      {
          $this->dom = $dom;
          $dom->nodes[] = $this;
      }

      public function __destruct()
      {
          $this->clear();
      }

      public function __toString()
      {
          return $this->outertext();
      }

      // clean up memory due to php5 circular references memory leak...
      public function clear()
      {
          $this->dom = null;
          $this->nodes = null;
          $this->parent = null;
          $this->children = null;
      }

      /**
       * Echo this node and, recursively, everything in $this->nodes.
       *
       * @param bool $show_attr Also print the attributes of each node.
       * @param int  $deep      Current depth; controls the leading indent.
       */
      public function dump($show_attr=true, $deep=0)
      {
          $lead = str_repeat(' ', $deep);

          echo $lead.$this->tag;
          if ($show_attr && count($this->attr)>0) {
              echo '(';
              foreach ($this->attr as $k=>$v) {
                  echo "[$k]=>\"".$this->$k.'", ';
              }
              echo ')';
          }
          echo "\n";

          if ($this->nodes) {
              foreach ($this->nodes as $c) {
                  $c->dump($show_attr, $deep+1);
              }
          }
      }

      /**
       * Debugging function to dump a single dom node with a bunch of information about it.
       *
       * @param bool $echo When true the text is echoed and nothing is returned.
       * @return string|null The description when $echo is false.
       */
      public function dump_node($echo=true)
      {
          $string = $this->tag;
          if (count($this->attr)>0) {
              $string .= '(';
              foreach ($this->attr as $k=>$v) {
                  $string .= "[$k]=>\"".$this->$k.'", ';
              }
              $string .= ')';
          }
          if (count($this->_)>0) {
              $string .= ' $_ (';
              foreach ($this->_ as $k=>$v) {
                  if (is_array($v)) {
                      $string .= "[$k]=>(";
                      foreach ($v as $k2=>$v2) {
                          // Some info slots hold arrays of strings (see __set());
                          // flatten them instead of printing "Array".
                          $shown = is_array($v2) ? implode(',', $v2) : $v2;
                          $string .= "[$k2]=>\"".$shown.'", ';
                      }
                      $string .= ")";
                  } else {
                      $string .= "[$k]=>\"".$v.'", ';
                  }
              }
              $string .= ")";
          }

          if (isset($this->text)) {
              $string .= " text: (" . $this->text . ")";
          }

          $string .= " HDOM_INNER_INFO: '";
          // The original read an undefined $node here, so this always printed NULL.
          if (isset($this->_[HDOM_INFO_INNER])) {
              $string .= $this->_[HDOM_INFO_INNER] . "'";
          } else {
              $string .= ' NULL ';
          }

          // clear() sets both lists to null; count() must not be called on null.
          $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
          $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
          $string .= " tag_start: " . $this->tag_start;
          $string .= "\n";

          if ($echo) {
              echo $string;
              return;
          }
          return $string;
      }

      /**
       * Return the parent of this node, optionally re-parenting it first.
       *
       * When a node is passed in, this node is removed from the nodes/children
       * lists of its current parent and appended to those of the new parent.
       * NOTE(review): the info-array positions (HDOM_INFO_BEGIN/END) and the
       * DOM-wide node list are not updated here, and seek() walks that list
       * by index, so find() presumably still sees the original layout.
       *
       * @param simple_html_dom_node|null $parent New parent, or null to only read.
       * @return simple_html_dom_node|null The (possibly new) parent.
       */
      public function parent($parent=null)
      {
          if ($parent !== null) {
              if ($this->parent !== null) {
                  $this->parent->nodes = self::without_node($this->parent->nodes, $this);
                  $this->parent->children = self::without_node($this->parent->children, $this);
              }
              $this->parent = $parent;
              $this->parent->nodes[] = $this;
              $this->parent->children[] = $this;
          }

          return $this->parent;
      }

      // Return $list re-indexed without any entry identical to $node.
      private static function without_node($list, $node)
      {
          if (!is_array($list)) {
              return $list;
          }
          $kept = array();
          foreach ($list as $item) {
              if ($item !== $node) {
                  $kept[] = $item;
              }
          }
          return $kept;
      }

      // verify that node has children
      public function has_child()
      {
          return !empty($this->children);
      }

      /**
       * Return the children of this node.
       *
       * @param int $idx -1 for the whole list, otherwise the index wanted.
       * @return array|simple_html_dom_node|null The list, one child, or null when $idx is absent.
       */
      public function children($idx=-1)
      {
          if ($idx===-1) {
              return $this->children;
          }
          if (isset($this->children[$idx])) {
              return $this->children[$idx];
          }
          return null;
      }

      // returns the first child of node, or null when there is none
      public function first_child()
      {
          if (!empty($this->children)) {
              return $this->children[0];
          }
          return null;
      }

      // returns the last child of node, or null when there is none
      public function last_child()
      {
          if (!empty($this->children)) {
              return $this->children[count($this->children)-1];
          }
          return null;
      }

      // returns the next sibling of node, or null
      public function next_sibling()
      {
          return $this->sibling(1);
      }

      // returns the previous sibling of node, or null
      public function prev_sibling()
      {
          return $this->sibling(-1);
      }

      /**
       * Shared sibling lookup: the entry $offset positions away from this
       * node in the parent's children list.
       *
       * Returns null when there is no parent, when this node is not in the
       * parent's children list, or when the target position does not exist.
       * (The original prev_sibling() returned the parent's last child for a
       * node that was not in the list.)
       */
      private function sibling($offset)
      {
          if ($this->parent===null || !is_array($this->parent->children)) {
              return null;
          }
          $siblings = $this->parent->children;
          $idx = array_search($this, $siblings, true);
          if ($idx===false) {
              return null;
          }
          $idx += $offset;
          return isset($siblings[$idx]) ? $siblings[$idx] : null;
      }

      /**
       * Locate a specific ancestor tag in the path to the root.
       * The node itself is included in the comparison.
       *
       * @param string $tag Tag name to look for.
       * @return simple_html_dom_node|null First match walking upwards, or null.
       */
      public function find_ancestor_tag($tag)
      {
          global $debug_object;
          if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

          // Start by including ourselves in the comparison.
          $returnDom = $this;

          while (!is_null($returnDom)) {
              if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }

              if ($returnDom->tag == $tag) {
                  break;
              }
              $returnDom = $returnDom->parent;
          }
          return $returnDom;
      }

      // get dom node's inner html
      public function innertext()
      {
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

          $ret = '';
          // $this->nodes is null after clear()
          if ($this->nodes) {
              foreach ($this->nodes as $n) {
                  $ret .= $n->outertext();
              }
          }
          return $ret;
      }

      // get dom node's outer text (with tag)
      public function outertext()
      {
          global $debug_object;
          if (is_object($debug_object)) {
              $text = '';
              if ($this->tag == 'text') {
                  if (!empty($this->text)) {
                      $text = " with text: " . $this->text;
                  }
              }
              $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
          }

          if ($this->tag==='root') return $this->innertext();

          // trigger callback
          if ($this->dom && $this->dom->callback!==null) {
              call_user_func_array($this->dom->callback, array($this));
          }

          if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

          // render begin tag (guarded so a node without a begin index renders without notices)
          if ($this->dom
              && isset($this->_[HDOM_INFO_BEGIN])
              && isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
          ) {
              $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
          } else {
              $ret = "";
          }

          // render inner text
          if (isset($this->_[HDOM_INFO_INNER])) {
              // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
              if ($this->tag != "br") {
                  $ret .= $this->_[HDOM_INFO_INNER];
              }
          } else {
              if ($this->nodes) {
                  foreach ($this->nodes as $n) {
                      $ret .= $this->convert_text($n->outertext());
                  }
              }
          }

          // render end tag
          if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) {
              $ret .= '</'.$this->tag.'>';
          }
          return $ret;
      }

      // get dom node's plain text
      public function text()
      {
          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
          switch ($this->nodetype) {
              case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
              case HDOM_TYPE_COMMENT: return '';
              case HDOM_TYPE_UNKNOWN: return '';
          }
          if (strcasecmp($this->tag, 'script')===0) return '';
          if (strcasecmp($this->tag, 'style')===0) return '';

          $ret = '';
          // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
          // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
          if (!is_null($this->nodes)) {
              foreach ($this->nodes as $n) {
                  $ret .= $this->convert_text($n->text());
              }

              // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
              if ($this->tag == "span") {
                  $ret .= $this->dom->default_span_text;
              }
          }
          return $ret;
      }

      // inner text with the CDATA markers removed
      public function xmltext()
      {
          $ret = $this->innertext();
          $ret = str_ireplace('<![CDATA[', '', $ret);
          $ret = str_replace(']]>', '', $ret);
          return $ret;
      }

      /**
       * Build the node's opening tag, including its attributes.
       *
       * Spacing and quoting are read from the info array by attribute
       * position. When an entry is missing, the defaults that __set() stores
       * for new attributes are used.
       *
       * @return string
       */
      public function makeup()
      {
          // text, comment, unknown
          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

          $ret = '<'.$this->tag;
          $i = -1;

          foreach ($this->attr as $key=>$val) {
              ++$i;

              // skip removed attribute
              if ($val===null || $val===false) {
                  continue;
              }

              $space = isset($this->_[HDOM_INFO_SPACE][$i]) ? $this->_[HDOM_INFO_SPACE][$i] : array(' ', '', '');
              $ret .= $space[0];
              //no value attr: nowrap, checked selected...
              if ($val===true) {
                  $ret .= $key;
              } else {
                  $quote_type = isset($this->_[HDOM_INFO_QUOTE][$i]) ? $this->_[HDOM_INFO_QUOTE][$i] : HDOM_QUOTE_DOUBLE;
                  switch ($quote_type) {
                      case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                      case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                      default: $quote = '';
                  }
                  $ret .= $key.$space[1].'='.$space[2].$quote.$val.$quote;
              }
          }
          $ret = $this->dom->restore_noise($ret);
          $endspace = isset($this->_[HDOM_INFO_ENDSPACE]) ? $this->_[HDOM_INFO_ENDSPACE] : '';
          return $ret . $endspace . '>';
      }

      /**
       * Find elements by css selector.
       * PaperG - added ability for find to lowercase the value of the selector.
       *
       * @param string   $selector  Selector string; comma separates alternatives.
       * @param int|null $idx       null for all matches, otherwise the index wanted
       *                            (negative counts from the end).
       * @param bool     $lowercase Compare selector values case-insensitively.
       * @return array|simple_html_dom_node|null Matches, one match, or null when $idx is absent.
       */
      public function find($selector, $idx=null, $lowercase=false)
      {
          $selectors = $this->parse_selector($selector);
          if (($count=count($selectors))===0) return array();
          $found_keys = array();

          // find each selector
          for ($c=0; $c<$count; ++$c) {
              // The change on the below line was documented on the sourceforge code tracker id 2788009
              // used to be: if (($levle=count($selectors[0]))===0) return array();
              if (($levels=count($selectors[$c]))===0) return array();
              if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

              $head = array($this->_[HDOM_INFO_BEGIN]=>1);

              // handle descendant selectors, no recursive!
              for ($l=0; $l<$levels; ++$l) {
                  $ret = array();
                  foreach ($head as $k=>$v) {
                      $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                      //PaperG - Pass this optional parameter on to the seek function.
                      $n->seek($selectors[$c][$l], $ret, $lowercase);
                  }
                  $head = $ret;
              }

              foreach ($head as $k=>$v) {
                  if (!isset($found_keys[$k])) {
                      $found_keys[$k] = 1;
                  }
              }
          }

          // sort keys
          ksort($found_keys);

          $found = array();
          foreach ($found_keys as $k=>$v) {
              $found[] = $this->dom->nodes[$k];
          }

          // return nth-element or array
          if (is_null($idx)) return $found;
          if ($idx<0) $idx = count($found) + $idx;
          return (isset($found[$idx])) ? $found[$idx] : null;
      }

      /**
       * Seek for given conditions below this node.
       * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
       *
       * @param array $selector  One parsed selector: array(tag, key, val, exp, no_key).
       * @param array $ret       Passed by reference; matching node indexes are added as keys.
       * @param bool  $lowercase Compare values case-insensitively.
       */
      protected function seek($selector, &$ret, $lowercase=false)
      {
          global $debug_object;
          if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

          list($tag, $key, $val, $exp, $no_key) = $selector;

          // xpath index
          if ($tag && $key && is_numeric($key)) {
              $count = 0;
              foreach ($this->children as $c) {
                  if ($tag==='*' || $tag===$c->tag) {
                      if (++$count==$key) {
                          $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                          return;
                      }
                  }
              }
              return;
          }

          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end==0) {
              // Walk up until an ancestor knows where it ends. The null test
              // must come first so a missing parent is never dereferenced.
              $parent = $this->parent;
              while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              if ($parent!==null) {
                  $end += $parent->_[HDOM_INFO_END];
              }
          }

          for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
              if (!isset($this->dom->nodes[$i])) {
                  continue;
              }
              $node = $this->dom->nodes[$i];

              $pass = true;

              if ($tag==='*' && !$key) {
                  if (in_array($node, $this->children, true)) {
                      $ret[$i] = 1;
                  }
                  continue;
              }

              // compare tag
              if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
              // compare key
              if ($pass && $key) {
                  if ($no_key) {
                      if (isset($node->attr[$key])) $pass=false;
                  } else {
                      if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
                  }
              }
              // compare value ('0' is a real value, so test for null/'' explicitly rather than truthiness)
              if ($pass && $key && $val!==null && $val!=='' && $val!=='*') {
                  // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
                  if ($key == "plaintext") {
                      // $node->plaintext actually returns $node->text();
                      $nodeKeyValue = $node->text();
                  } else {
                      // this is a normal search, we want the value of that attribute of the tag.
                      $nodeKeyValue = isset($node->attr[$key]) ? $node->attr[$key] : '';
                  }
                  if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

                  //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
                  if ($lowercase) {
                      $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
                  } else {
                      $check = $this->match($exp, $val, $nodeKeyValue);
                  }
                  if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}

                  // handle multiple class
                  if (!$check && strcasecmp($key, 'class')===0) {
                      foreach (explode(' ', (string) $nodeKeyValue) as $k) {
                          // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                          if ($k!=='') {
                              if ($lowercase) {
                                  $check = $this->match($exp, strtolower($val), strtolower($k));
                              } else {
                                  $check = $this->match($exp, $val, $k);
                              }
                              if ($check) break;
                          }
                      }
                  }
                  if (!$check) $pass = false;
              }
              if ($pass) $ret[$i] = 1;
              unset($node);
          }
          // It's passed by reference so this is actually what this function returns.
          if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
      }

      /**
       * Test one attribute value against a selector value.
       *
       * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
       * @param string $pattern Selector value. For '*=' it is used as a regular
       *                        expression; a leading '/' means it already carries
       *                        its own delimiters.
       * @param string $value   Value taken from the node.
       * @return bool|int Truthy on match; false for an unknown operator.
       */
      protected function match($exp, $pattern, $value)
      {
          global $debug_object;
          if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

          switch ($exp) {
              case '=':
                  return ($value===$pattern);
              case '!=':
                  return ($value!==$pattern);
              case '^=':
                  return preg_match("/^".preg_quote($pattern, '/')."/", $value);
              case '$=':
                  return preg_match("/".preg_quote($pattern, '/')."$/", $value);
              case '*=':
                  if ($pattern[0]=='/') {
                      return preg_match($pattern, $value);
                  }
                  return preg_match("/".$pattern."/i", $value);
          }
          return false;
      }

      /**
       * Split a selector string into groups of parsed selectors.
       *
       * @param string $selector_string Selector text; ',' separates groups.
       * @return array List of groups; each group is a list of
       *               array(tag, key, val, exp, no_key) entries.
       */
      protected function parse_selector($selector_string)
      {
          global $debug_object;
          if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

          // pattern of CSS selectors, modified from mootools
          // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
          // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
          // The "\[@?" part lets an attribute name start with an @ sign that is NOT captured by the expression.
          // Further study is required to determine if this should be documented or removed.
          // The hyphens inside the character classes are escaped: an unescaped "\w-:" is
          // rejected by PCRE2 (PHP >= 7.3) as an invalid range.
          $pattern = '/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is';
          preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
          if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

          $selectors = array();
          $result = array();

          foreach ($matches as $m) {
              $m[0] = trim($m[0]);
              if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
              // for browser generated xpath
              if ($m[1]==='tbody') continue;

              list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
              // Compare against '' instead of using empty(), which would drop the value "0".
              if ($m[2]!=='') {$key='id'; $val=$m[2];}
              if ($m[3]!=='') {$key='class'; $val=$m[3];}
              if ($m[4]!=='') {$key=$m[4];}
              if ($m[5]!=='') {$exp=$m[5];}
              if ($m[6]!=='') {$val=$m[6];}

              // convert to lowercase
              if ($this->dom->lowercase) {
                  $tag = strtolower($tag);
                  if ($key!==null) {
                      $key = strtolower($key);
                  }
              }
              //elements that do NOT have the specified attribute
              if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

              $result[] = array($tag, $key, $val, $exp, $no_key);
              // A comma closes the current group. The original compared the trimmed
              // separator with ', ', which trim() can never produce.
              if (trim($m[7])===',') {
                  $selectors[] = $result;
                  $result = array();
              }
          }
          if (count($result)>0) {
              $selectors[] = $result;
          }
          return $selectors;
      }

      /**
       * Magic read: an attribute value, or one of the virtual properties
       * outertext / innertext / plaintext / xmltext. For any other name the
       * result is whether the attribute key exists.
       */
      public function __get($name)
      {
          if (isset($this->attr[$name])) {
              return $this->convert_text($this->attr[$name]);
          }
          switch ($name) {
              case 'outertext': return $this->outertext();
              case 'innertext': return $this->innertext();
              case 'plaintext': return $this->text();
              case 'xmltext': return $this->xmltext();
              default: return array_key_exists($name, $this->attr);
          }
      }

      /**
       * Magic write: overrides outer/inner text, or sets an attribute.
       * A new attribute also gets default spacing/quoting info for makeup().
       */
      public function __set($name, $value)
      {
          global $debug_object;
          if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

          switch ($name) {
              case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
              case 'innertext':
                  if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                  return $this->_[HDOM_INFO_INNER] = $value;
          }
          // array_key_exists: an attribute nulled by removeAttribute() keeps its
          // position, so it must not get a second info entry when set again.
          if (!array_key_exists($name, $this->attr)) {
              $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
              $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
          }
          $this->attr[$name] = $value;
      }

      // Magic isset: the virtual text properties always exist; otherwise the attribute key must.
      public function __isset($name)
      {
          switch ($name) {
              case 'outertext': return true;
              case 'innertext': return true;
              case 'plaintext': return true;
              case 'xmltext': return true;
          }
          //no value attr: nowrap, checked selected...
          return array_key_exists($name, $this->attr);
      }

      /**
       * Magic unset: drop an attribute together with its spacing/quoting info.
       *
       * makeup() pairs attributes with that info by position, so leaving the
       * info behind would shift it onto the attributes that follow.
       */
      public function __unset($name)
      {
          if (!isset($this->attr[$name])) {
              return;
          }

          // position of the attribute in iteration order
          $pos = 0;
          foreach ($this->attr as $key=>$unused) {
              if ((string) $key===(string) $name) break;
              ++$pos;
          }

          foreach (array(HDOM_INFO_SPACE, HDOM_INFO_QUOTE) as $slot) {
              if (isset($this->_[$slot]) && is_array($this->_[$slot]) && array_key_exists($pos, $this->_[$slot])) {
                  array_splice($this->_[$slot], $pos, 1);
              }
          }
          unset($this->attr[$name]);
      }

      /**
       * PaperG - Function to convert the text from one character set to another if the two sets are not the same.
       *
       * Charsets are read from the owning DOM (_charset and _target_charset).
       * Non-string values are returned untouched, and a failed conversion
       * keeps the original text instead of replacing it with false.
       *
       * @param mixed $text Text to convert.
       * @return mixed Converted text, with a UTF-8 BOM stripped from either end when the target is UTF-8.
       */
      public function convert_text($text)
      {
          global $debug_object;
          if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

          // e.g. boolean "no value" attributes - nothing to convert
          if (!is_string($text)) {
              return $text;
          }

          $converted_text = $text;

          $sourceCharset = "";
          $targetCharset = "";

          if ($this->dom) {
              $sourceCharset = strtoupper((string) $this->dom->_charset);
              $targetCharset = strtoupper((string) $this->dom->_target_charset);
          }
          if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

          if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
              // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
              if (!((strcasecmp($targetCharset, 'UTF-8') == 0) && self::is_utf8($text))) {
                  $result = iconv($sourceCharset, $targetCharset, $text);
                  if ($result !== false) {
                      $converted_text = $result;
                  }
              }
          }

          // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
          if ($targetCharset == 'UTF-8') {
              if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
                  $converted_text = substr($converted_text, 3);
              }
              if (substr($converted_text, -3) === "\xef\xbb\xbf") {
                  $converted_text = substr($converted_text, 0, -3);
              }
          }

          return $converted_text;
      }

      /**
       * Returns true if $str is structurally valid UTF-8 and false otherwise.
       *
       * Every lead byte must be followed by the right number of continuation
       * bytes (0x80-0xBF). Any byte from 0x80 to 0xBF in lead position is
       * rejected; the original let a lone 0x80 through.
       *
       * @param mixed $str String to be tested
       * @return boolean
       */
      public static function is_utf8($str)
      {
          $len = strlen($str);
          for ($i=0; $i<$len; $i++) {
              $c = ord($str[$i]);
              if ($c < 128) {
                  continue; // plain ASCII
              }

              if ($c >= 254) return false;
              elseif ($c >= 252) $bits = 6;
              elseif ($c >= 248) $bits = 5;
              elseif ($c >= 240) $bits = 4;
              elseif ($c >= 224) $bits = 3;
              elseif ($c >= 192) $bits = 2;
              else return false; // continuation byte where a lead byte is required

              if (($i+$bits) > $len) return false;
              while ($bits > 1) {
                  $i++;
                  $b = ord($str[$i]);
                  if ($b < 128 || $b > 191) return false;
                  $bits--;
              }
          }
          return true;
      }

      /**
       * Function to try a few tricks to determine the displayed size of an img on the page.
       * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
       *
       * The width/height attributes win; otherwise a pixel value from the
       * inline style is used.
       *
       * Future enhancement: look at classes/ids on the tag or its parents,
       * including rules from separate css files.
       *
       * @author John Schlick
       * @version April 19 2012
       * @return array|false an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
       */
      public function get_display_size()
      {
          $width = -1;
          $height = -1;

          if ($this->tag !== 'img') {
              return false;
          }

          // See if there is a height or width attribute in the tag itself.
          if (isset($this->attr['width'])) {
              $width = $this->attr['width'];
          }

          if (isset($this->attr['height'])) {
              $height = $this->attr['height'];
          }

          // Now look for an inline style.
          if (isset($this->attr['style'])) {
              // Thanks to user gnarf from stackoverflow for this regular expression.
              $attributes = array();
              preg_match_all('/([\w\-]+)\s*:\s*([^;]+)\s*;?/', $this->attr['style'], $matches, PREG_SET_ORDER);
              foreach ($matches as $match) {
                  // The value group is greedy and keeps whitespace before the ';', so trim it.
                  $attributes[strtolower($match[1])] = trim($match[2]);
              }

              // If there is a width in the style attributes:
              if (isset($attributes['width']) && $width == -1) {
                  $pixels = self::style_pixels($attributes['width']);
                  if ($pixels !== false) {
                      $width = $pixels;
                  }
              }

              // If there is a height in the style attributes:
              if (isset($attributes['height']) && $height == -1) {
                  $pixels = self::style_pixels($attributes['height']);
                  if ($pixels !== false) {
                      $height = $pixels;
                  }
              }
          }

          return array('height' => $height,
                       'width' => $width);
      }

      // Return the integer part of a "<int>px" style value as a string, or false for anything else.
      private static function style_pixels($value)
      {
          $value = trim($value);
          // check that the last two characters are px (pixels)
          if (strtolower(substr($value, -2)) !== 'px') {
              return false;
          }
          $number = trim((string) substr($value, 0, -2));
          // filter_var() returns the integer itself, so 0 must be told apart from false.
          return (filter_var($number, FILTER_VALIDATE_INT) !== false) ? $number : false;
      }

      // camel naming conventions
      public function getAllAttributes() {return $this->attr;}
      public function getAttribute($name) {return $this->__get($name);}
      public function setAttribute($name, $value) {$this->__set($name, $value);}
      public function hasAttribute($name) {return $this->__isset($name);}
      public function removeAttribute($name) {$this->__set($name, null);}
      public function getElementById($id) {return $this->find("#$id", 0);}
      public function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
      public function getElementByTagName($name) {return $this->find($name, 0);}
      public function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
      public function parentNode() {return $this->parent();}
      public function childNodes($idx=-1) {return $this->children($idx);}
      public function firstChild() {return $this->first_child();}
      public function lastChild() {return $this->last_child();}
      public function nextSibling() {return $this->next_sibling();}
      public function previousSibling() {return $this->prev_sibling();}
      public function hasChildNodes() {return $this->has_child();}
      public function nodeName() {return $this->tag;}
      public function appendChild($node) {$node->parent($this); return $node;}
  }
  637.