WXR_Parser_Regex

WXR Parser that uses regular expressions.

Defined (1)

The class is defined in the following location(s).

/parsers.php  
  /**
   * WXR parser that uses regular expressions.
   *
   * Reads an export file line by line, collecting authors, categories, tags,
   * terms and posts into public arrays. When the zlib functions are callable
   * the file is read through them instead of the plain file functions.
   */
  class WXR_Parser_Regex {
      /** @var array Parsed authors, keyed by author login. */
      public $authors = array();

      /** @var array Parsed posts, in file order. */
      public $posts = array();

      /** @var array Parsed categories, in file order. */
      public $categories = array();

      /** @var array Parsed tags, in file order. */
      public $tags = array();

      /** @var array Parsed terms, in file order. */
      public $terms = array();

      /** @var string Content of the <wp:base_site_url> element, if any. */
      public $base_url = '';

      /**
       * @var bool Whether gzopen() is callable. Declared explicitly so the
       *           constructor does not create a dynamic property.
       */
      public $has_gzip = false;

      /**
       * Records whether the gz* stream functions can be used.
       */
      public function __construct() {
          $this->has_gzip = is_callable( 'gzopen' );
      }

      /**
       * Parses a WXR file.
       *
       * Single-line elements (<wp:base_site_url>, <wp:category>, <wp:tag>,
       * <wp:term>, <wp:author>) are only processed when both the opening and
       * the closing tag are found on the same line; otherwise the line is
       * skipped instead of recording an empty entry.
       *
       * @param string $file Path of the file to read.
       * @return array|WP_Error Parsed data (authors, posts, categories, tags,
       *                        terms, base_url, version), or a WP_Error when
       *                        no WXR version number was found.
       */
      public function parse( $file ) {
          $wxr_version = false;
          $in_post     = false;
          // Initialised so a stray </item> never reads an undefined variable.
          $post = '';

          $fp = $this->fopen( $file, 'r' );
          if ( $fp ) {
              while ( ! $this->feof( $fp ) ) {
                  // fgets() yields false at end of file; treat it as an empty line.
                  $importline = rtrim( (string) $this->fgets( $fp ) );

                  if ( ! $wxr_version && preg_match( '|<wp:wxr_version>(\d+\.\d+)</wp:wxr_version>|', $importline, $version ) ) {
                      $wxr_version = $version[1];
                  }

                  if ( false !== strpos( $importline, '<wp:base_site_url>' ) ) {
                      if ( preg_match( '|<wp:base_site_url>(.*?)</wp:base_site_url>|is', $importline, $url ) ) {
                          $this->base_url = $url[1];
                      }
                      continue;
                  }

                  if ( false !== strpos( $importline, '<wp:category>' ) ) {
                      if ( preg_match( '|<wp:category>(.*?)</wp:category>|is', $importline, $category ) ) {
                          $this->categories[] = $this->process_category( $category[1] );
                      }
                      continue;
                  }

                  if ( false !== strpos( $importline, '<wp:tag>' ) ) {
                      if ( preg_match( '|<wp:tag>(.*?)</wp:tag>|is', $importline, $tag ) ) {
                          $this->tags[] = $this->process_tag( $tag[1] );
                      }
                      continue;
                  }

                  if ( false !== strpos( $importline, '<wp:term>' ) ) {
                      if ( preg_match( '|<wp:term>(.*?)</wp:term>|is', $importline, $term ) ) {
                          $this->terms[] = $this->process_term( $term[1] );
                      }
                      continue;
                  }

                  if ( false !== strpos( $importline, '<wp:author>' ) ) {
                      if ( preg_match( '|<wp:author>(.*?)</wp:author>|is', $importline, $author ) ) {
                          $a = $this->process_author( $author[1] );
                          $this->authors[ $a['author_login'] ] = $a;
                      }
                      continue;
                  }

                  if ( false !== strpos( $importline, '<item>' ) ) {
                      // Start buffering a new post.
                      $post    = '';
                      $in_post = true;
                      continue;
                  }

                  if ( false !== strpos( $importline, '</item>' ) ) {
                      $in_post       = false;
                      $this->posts[] = $this->process_post( $post );
                      continue;
                  }

                  if ( $in_post ) {
                      $post .= $importline . "\n";
                  }
              }

              $this->fclose( $fp );
          }

          if ( ! $wxr_version ) {
              return new WP_Error( 'WXR_parse_error', __( 'This does not appear to be a WXR file, missing/invalid WXR version number', 'wordpress-importer' ) );
          }

          return array(
              'authors'    => $this->authors,
              'posts'      => $this->posts,
              'categories' => $this->categories,
              'tags'       => $this->tags,
              'terms'      => $this->terms,
              'base_url'   => $this->base_url,
              'version'    => $wxr_version,
          );
      }

      /**
       * Extracts the content of the first <$tag>...</$tag> element in a string.
       *
       * The tag name is matched exactly (optionally followed by attributes), so
       * asking for "wp:post_date" does not match "<wp:post_date_gmt>". CDATA
       * wrappers are removed; content split over several CDATA sections is
       * joined back together.
       *
       * @param string $string Text to search.
       * @param string $tag    Element name, e.g. "wp:post_id".
       * @return string Element content, or '' when the element is not found.
       */
      public function get_tag( $string, $tag ) {
          $name = preg_quote( $tag, '|' );

          if ( ! preg_match( "|<{$name}(?:\s.*?)?>(.*?)</{$name}>|is", (string) $string, $found ) ) {
              return '';
          }

          $inner = $found[1];

          if ( substr( $inner, 0, 9 ) != '<![CDATA[' ) {
              return $inner;
          }

          if ( strpos( $inner, ']]]]><![CDATA[>' ) !== false ) {
              // A literal "]]>" was escaped by splitting the CDATA section:
              // concatenate the payload of every section.
              preg_match_all( '|<!\[CDATA\[(.*?)\]\]>|s', $inner, $matches );
              return implode( '', $matches[1] );
          }

          return preg_replace( '|^<!\[CDATA\[(.*)\]\]>$|s', '$1', $inner );
      }

      /**
       * Builds a category record from the inner markup of a <wp:category>.
       *
       * @param string $c Inner markup of the element.
       * @return array
       */
      public function process_category( $c ) {
          return array(
              'term_id'              => $this->get_tag( $c, 'wp:term_id' ),
              'cat_name'             => $this->get_tag( $c, 'wp:cat_name' ),
              'category_nicename'    => $this->get_tag( $c, 'wp:category_nicename' ),
              'category_parent'      => $this->get_tag( $c, 'wp:category_parent' ),
              'category_description' => $this->get_tag( $c, 'wp:category_description' ),
          );
      }

      /**
       * Builds a tag record from the inner markup of a <wp:tag>.
       *
       * @param string $t Inner markup of the element.
       * @return array
       */
      public function process_tag( $t ) {
          return array(
              'term_id'         => $this->get_tag( $t, 'wp:term_id' ),
              'tag_name'        => $this->get_tag( $t, 'wp:tag_name' ),
              'tag_slug'        => $this->get_tag( $t, 'wp:tag_slug' ),
              'tag_description' => $this->get_tag( $t, 'wp:tag_description' ),
          );
      }

      /**
       * Builds a term record from the inner markup of a <wp:term>.
       *
       * @param string $t Inner markup of the element.
       * @return array
       */
      public function process_term( $t ) {
          return array(
              'term_id'          => $this->get_tag( $t, 'wp:term_id' ),
              'term_taxonomy'    => $this->get_tag( $t, 'wp:term_taxonomy' ),
              'slug'             => $this->get_tag( $t, 'wp:term_slug' ),
              'term_parent'      => $this->get_tag( $t, 'wp:term_parent' ),
              'term_name'        => $this->get_tag( $t, 'wp:term_name' ),
              'term_description' => $this->get_tag( $t, 'wp:term_description' ),
          );
      }

      /**
       * Builds an author record from the inner markup of a <wp:author>.
       *
       * @param string $a Inner markup of the element.
       * @return array
       */
      public function process_author( $a ) {
          return array(
              'author_id'           => $this->get_tag( $a, 'wp:author_id' ),
              'author_login'        => $this->get_tag( $a, 'wp:author_login' ),
              'author_email'        => $this->get_tag( $a, 'wp:author_email' ),
              'author_display_name' => $this->get_tag( $a, 'wp:author_display_name' ),
              'author_first_name'   => $this->get_tag( $a, 'wp:author_first_name' ),
              'author_last_name'    => $this->get_tag( $a, 'wp:author_last_name' ),
          );
      }

      /**
       * Builds a post record from the buffered markup of an <item>.
       *
       * The 'terms', 'comments', 'postmeta' and 'attachment_url' keys are only
       * present when the corresponding data was found in the markup.
       *
       * @param string $post Markup collected between <item> and </item>.
       * @return array
       */
      public function process_post( $post ) {
          $post_id        = $this->get_tag( $post, 'wp:post_id' );
          $post_title     = $this->get_tag( $post, 'title' );
          $post_date      = $this->get_tag( $post, 'wp:post_date' );
          $post_date_gmt  = $this->get_tag( $post, 'wp:post_date_gmt' );
          $comment_status = $this->get_tag( $post, 'wp:comment_status' );
          $ping_status    = $this->get_tag( $post, 'wp:ping_status' );
          $status         = $this->get_tag( $post, 'wp:status' );
          $post_name      = $this->get_tag( $post, 'wp:post_name' );
          $post_parent    = $this->get_tag( $post, 'wp:post_parent' );
          $menu_order     = $this->get_tag( $post, 'wp:menu_order' );
          $post_type      = $this->get_tag( $post, 'wp:post_type' );
          $post_password  = $this->get_tag( $post, 'wp:post_password' );
          $is_sticky      = $this->get_tag( $post, 'wp:is_sticky' );
          $guid           = $this->get_tag( $post, 'guid' );
          $post_author    = $this->get_tag( $post, 'dc:creator' );

          // Lower-case upper-case tag names and self-close <br>/<hr>.
          $post_excerpt = $this->get_tag( $post, 'excerpt:encoded' );
          $post_excerpt = preg_replace_callback( '|<(/?[A-Z]+)|', array( $this, '_normalize_tag' ), $post_excerpt );
          $post_excerpt = str_replace( '<br>', '<br />', $post_excerpt );
          $post_excerpt = str_replace( '<hr>', '<hr />', $post_excerpt );

          $post_content = $this->get_tag( $post, 'content:encoded' );
          $post_content = preg_replace_callback( '|<(/?[A-Z]+)|', array( $this, '_normalize_tag' ), $post_content );
          $post_content = str_replace( '<br>', '<br />', $post_content );
          $post_content = str_replace( '<hr>', '<hr />', $post_content );

          $postdata = compact(
              'post_id', 'post_author', 'post_date', 'post_date_gmt', 'post_content', 'post_excerpt',
              'post_title', 'status', 'post_name', 'comment_status', 'ping_status', 'guid', 'post_parent',
              'menu_order', 'post_type', 'post_password', 'is_sticky'
          );

          $attachment_url = $this->get_tag( $post, 'wp:attachment_url' );
          if ( $attachment_url ) {
              $postdata['attachment_url'] = $attachment_url;
          }

          $post_terms = array();
          preg_match_all( '|<category domain="([^"]+?)" nicename="([^"]+?)">(.+?)</category>|is', $post, $terms, PREG_SET_ORDER );
          foreach ( $terms as $t ) {
              $post_terms[] = array(
                  'slug'   => $t[2],
                  'domain' => $t[1],
                  'name'   => str_replace( array( '<![CDATA[', ']]>' ), '', $t[3] ),
              );
          }
          if ( ! empty( $post_terms ) ) {
              $postdata['terms'] = $post_terms;
          }

          $post_comments = array();
          preg_match_all( '|<wp:comment>(.+?)</wp:comment>|is', $post, $comments );
          foreach ( $comments[1] as $comment ) {
              preg_match_all( '|<wp:commentmeta>(.+?)</wp:commentmeta>|is', $comment, $commentmeta );
              $c_meta = array();
              foreach ( $commentmeta[1] as $m ) {
                  $c_meta[] = array(
                      'key'   => $this->get_tag( $m, 'wp:meta_key' ),
                      'value' => $this->get_tag( $m, 'wp:meta_value' ),
                  );
              }

              $post_comments[] = array(
                  'comment_id'           => $this->get_tag( $comment, 'wp:comment_id' ),
                  'comment_author'       => $this->get_tag( $comment, 'wp:comment_author' ),
                  'comment_author_email' => $this->get_tag( $comment, 'wp:comment_author_email' ),
                  'comment_author_IP'    => $this->get_tag( $comment, 'wp:comment_author_IP' ),
                  'comment_author_url'   => $this->get_tag( $comment, 'wp:comment_author_url' ),
                  'comment_date'         => $this->get_tag( $comment, 'wp:comment_date' ),
                  'comment_date_gmt'     => $this->get_tag( $comment, 'wp:comment_date_gmt' ),
                  'comment_content'      => $this->get_tag( $comment, 'wp:comment_content' ),
                  'comment_approved'     => $this->get_tag( $comment, 'wp:comment_approved' ),
                  'comment_type'         => $this->get_tag( $comment, 'wp:comment_type' ),
                  'comment_parent'       => $this->get_tag( $comment, 'wp:comment_parent' ),
                  'comment_user_id'      => $this->get_tag( $comment, 'wp:comment_user_id' ),
                  'commentmeta'          => $c_meta,
              );
          }
          if ( ! empty( $post_comments ) ) {
              $postdata['comments'] = $post_comments;
          }

          $post_postmeta = array();
          preg_match_all( '|<wp:postmeta>(.+?)</wp:postmeta>|is', $post, $postmeta );
          foreach ( $postmeta[1] as $p ) {
              $post_postmeta[] = array(
                  'key'   => $this->get_tag( $p, 'wp:meta_key' ),
                  'value' => $this->get_tag( $p, 'wp:meta_value' ),
              );
          }
          if ( ! empty( $post_postmeta ) ) {
              $postdata['postmeta'] = $post_postmeta;
          }

          return $postdata;
      }

      /**
       * preg_replace_callback() handler: lower-cases a matched tag opener.
       *
       * @param array $matches Match data; index 1 holds the tag name with an
       *                       optional leading slash.
       * @return string
       */
      public function _normalize_tag( $matches ) {
          return '<' . strtolower( $matches[1] );
      }

      /**
       * Opens a file with gzopen() when available, fopen() otherwise.
       *
       * @param string $filename Path of the file.
       * @param string $mode     Open mode.
       * @return resource|false
       */
      public function fopen( $filename, $mode = 'r' ) {
          if ( $this->has_gzip ) {
              return gzopen( $filename, $mode );
          }
          return fopen( $filename, $mode );
      }

      /**
       * Tests for end of file on a handle returned by this class's fopen().
       *
       * @param resource $fp File handle.
       * @return bool
       */
      public function feof( $fp ) {
          if ( $this->has_gzip ) {
              return gzeof( $fp );
          }
          return feof( $fp );
      }

      /**
       * Reads one line from a handle returned by this class's fopen().
       *
       * @param resource $fp  File handle.
       * @param int      $len Maximum number of bytes to read.
       * @return string|false
       */
      public function fgets( $fp, $len = 8192 ) {
          if ( $this->has_gzip ) {
              return gzgets( $fp, $len );
          }
          return fgets( $fp, $len );
      }

      /**
       * Closes a handle returned by this class's fopen().
       *
       * @param resource $fp File handle.
       * @return bool
       */
      public function fclose( $fp ) {
          if ( $this->has_gzip ) {
              return gzclose( $fp );
          }
          return fclose( $fp );
      }
  }