Jetpack_Media_Meta_Extractor

Class with methods to extract metadata about the videos, images, links, and mentions that are embedded in or attached to a post or page.

Defined (1)

The class is defined in the following location(s).

/class.media-extractor.php  
  1. class Jetpack_Media_Meta_Extractor { 
  2.  
  3. // Bit flags selecting what to extract; combine them with | to build the $what_to_extract mask 
  4. const ALL = 255; // All eight low bits set, so it includes every flag below 
  5. const LINKS = 1; 
  6. const MENTIONS = 2; 
  7. const IMAGES = 4; 
  8. const SHORTCODES = 8; // Presence and counts are recorded for every shortcode; IDs only for the keeper shortcodes below 
  9. const EMBEDS = 16; 
  10. const HASHTAGS = 32; 
  11.  
  12. // For these, we try to extract an ID from the shortcode attributes, rather than just recording its presence (which we do for all) 
  13. // The ID comes from a function jetpack_shortcode_get_{shortcode}_id( $atts ) or, failing that, a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ). 
  14. private static $KEEPER_SHORTCODES = array( 
  15. 'youtube',  
  16. 'vimeo',  
  17. 'hulu',  
  18. 'ted',  
  19. 'wpvideo',  
  20. 'audio',  
  21. ); 
  22.  
  /**
   * Gets the specified media and meta info from the given post.
   *
   * NOTE: If you have the post's HTML content already and don't need image data,
   * use extract_from_content() instead.
   *
   * @param int $blog_id         The ID of the blog.
   * @param int $post_id         The ID of the post.
   * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
   * @return array A structure containing metadata about the embedded things, or an empty array
   *               if nothing was found or the post could not be loaded.
   */
  static public function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {

      // Multisite? Then read the post in the context of the requested blog.
      $can_switch = function_exists( 'switch_to_blog' );
      if ( $can_switch ) {
          switch_to_blog( $blog_id );
      }

      $post = get_post( $post_id );

      // A missing post must not be dereferenced, and the blog switch must still be undone.
      if ( ! is_object( $post ) ) {
          if ( $can_switch ) {
              restore_current_blog();
          }
          return array();
      }

      $content = $post->post_title . "\n\n" . $post->post_content;

      // Prevent running extraction on really huge amounts of content (about 20k English words).
      if ( strlen( $content ) > 100000 ) {
          // Prefer a cut that cannot split a multibyte character: the /u regexes used during
          // extraction fail outright on invalid UTF-8.
          if ( function_exists( 'mb_strcut' ) ) {
              $content = mb_strcut( $content, 0, 100000, 'UTF-8' );
          } else {
              $content = substr( $content, 0, 100000 );
          }
      }

      $extracted = array();

      // Get images first, we need the full post for that.
      if ( self::IMAGES & $what_to_extract ) {
          $extracted = self::get_image_fields( $post );

          // Turn off images so extract_from_content() below does not process them again.
          $what_to_extract = $what_to_extract & ~self::IMAGES;
      }

      if ( $can_switch ) {
          restore_current_blog();
      }

      // All of the other things besides images can be extracted from just the content.
      return self::extract_from_content( $content, $what_to_extract, $extracted );
  }
  61.  
  /**
   * Gets the specified meta info from the given post content.
   *
   * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which also looks at the
   * featured image, slideshows and galleries. Here, images are only looked for in the markup itself.
   * Hashtags are only extracted when IS_WPCOM is defined and truthy.
   *
   * @param string $content           The HTML post_content of a post.
   * @param int    $what_to_extract   A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
   * @param array  $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here.
   * @return array A structure containing metadata about the embedded things, or an empty array if nothing was found.
   */
  static public function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
      $stripped_content = self::get_stripped_content( $content );

      // Maybe start with some previously extracted things (e.g. images from extract()).
      $extracted = $already_extracted;

      // Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

      // ----------------------------------- IMAGES ------------------------------

      if ( self::IMAGES & $what_to_extract ) {
          // Use the raw markup: the stripped copy has had every tag removed, so no image tag could be found in it.
          $images    = self::extract_images_from_content( $content, array() );
          $extracted = array_merge( $extracted, $images );
      }

      // ----------------------------------- MENTIONS ------------------------------

      if ( self::MENTIONS & $what_to_extract ) {
          if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
              // Lowercase BEFORE de-duplicating so "@Foo" and "@foo" collapse into one entry.
              // array_unique() retains the keys, hence array_values().
              $mentions             = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );
              $extracted['mention'] = array( 'name' => $mentions );
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['mention'] = count( $mentions );
          }
      }

      // ----------------------------------- HASHTAGS ------------------------------
      /** Some hosts may not compile with --enable-unicode-properties and kick a warning:
       *   Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
       * Therefore, we only run this code block on wpcom, not in Jetpack.
       */
      if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
          // This regex does not exactly match Twitter's
          // if there are problems/complaints we should implement this:
          //   https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
          if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
              // Same ordering as for mentions: lowercase first, then de-duplicate and re-index.
              $hashtags             = array_values( array_unique( array_map( 'strtolower', $matches[1] ) ) );
              $extracted['hashtag'] = array( 'name' => $hashtags );
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['hashtag'] = count( $hashtags );
          }
      }

      // ----------------------------------- SHORTCODES ------------------------------

      // Always look for shortcodes.
      // If we don't want them, we'll just remove them, so we don't grab them as links below.
      $shortcode_pattern = '/' . get_shortcode_regex() . '/s';
      if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

          $shortcode_total_count = 0;
          $shortcode_type_counts = array();
          $shortcode_types       = array();
          $shortcode_details     = array();

          if ( self::SHORTCODES & $what_to_extract ) {

              foreach ( $matches[2] as $key => $shortcode ) {
                  // Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
                  $shortcode_name = preg_replace( '/[., *"\'\/\\\\#+ ]/', '_', $shortcode );

                  $attr = shortcode_parse_atts( $matches[3][ $key ] );

                  $shortcode_total_count++;
                  if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
                      $shortcode_type_counts[ $shortcode_name ] = 0;
                  }
                  $shortcode_type_counts[ $shortcode_name ]++;

                  // Store (uniquely) presence of all shortcodes regardless of whether it's a keeper (for those, get ID below).
                  // @todo Store number of occurrences?
                  if ( ! in_array( $shortcode_name, $shortcode_types ) ) {
                      $shortcode_types[] = $shortcode_name;
                  }

                  // For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.).
                  if ( in_array( $shortcode, self::$KEEPER_SHORTCODES ) ) {
                      // Start clean so an ID from the previous shortcode can never leak into this one.
                      $id = null;

                      // We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
                      // If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
                      $shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
                      $shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
                      $shortcode_get_id_method = "get_{$shortcode}_id";
                      if ( function_exists( $shortcode_get_id_func ) ) {
                          $id = call_user_func( $shortcode_get_id_func, $attr );
                      } else if ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
                          $id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
                      }

                      if ( ! empty( $id )
                          && ( ! isset( $shortcode_details[ $shortcode_name ] ) || ! in_array( $id, $shortcode_details[ $shortcode_name ] ) ) ) {
                          $shortcode_details[ $shortcode_name ][] = $id;
                      }
                  }
              }

              if ( $shortcode_total_count > 0 ) {
                  // Add the shortcode info to the $extracted array.
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['shortcode'] = $shortcode_total_count;
                  $extracted['shortcode']        = array();
                  foreach ( $shortcode_type_counts as $type => $count ) {
                      $extracted['shortcode'][ $type ] = array( 'count' => $count );
                  }
                  if ( ! empty( $shortcode_types ) ) {
                      $extracted['shortcode_types'] = $shortcode_types;
                  }
                  foreach ( $shortcode_details as $type => $id ) {
                      $extracted['shortcode'][ $type ]['id'] = $id;
                  }
              }
          }

          // Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
          // preg_replace() returns null on a PCRE error; keep the content we have in that case.
          $without_shortcodes = preg_replace( $shortcode_pattern, ' ', $content );
          if ( null !== $without_shortcodes ) {
              $content = $without_shortcodes;
          }
      }

      // ----------------------------------- LINKS ------------------------------

      if ( self::LINKS & $what_to_extract ) {

          // To hold the extracted stuff we find.
          $links = array();

          // Collect the image URLs already extracted so they are not reported again as links.
          // build_image_struct() stores them as a list of array( 'url' => ... ) entries; the flat
          // array( 'url' => ... ) shape is also honoured in case a caller passes it in $already_extracted.
          $image_urls = array();
          if ( ! empty( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
              foreach ( $extracted['image'] as $image ) {
                  if ( is_array( $image ) && isset( $image['url'] ) && is_string( $image['url'] ) ) {
                      $image_urls[] = $image['url'];
                  }
              }
              if ( isset( $extracted['image']['url'] ) ) {
                  $image_urls = array_merge( $image_urls, (array) $extracted['image']['url'] );
              }
          }

          // @todo Get the text inside the links?

          // Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
          // (we treat embed links as just another link).
          if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

              foreach ( $matches[1] as $link_raw ) {
                  // Remove large (and likely invalid) links.
                  if ( 4096 < strlen( $link_raw ) ) {
                      continue;
                  }

                  $url = parse_url( $link_raw );

                  // Skip anything parse_url() could not break down into at least a scheme and a host.
                  if ( ! is_array( $url ) || ! isset( $url['scheme'], $url['host'] ) ) {
                      continue;
                  }

                  // Data URI links.
                  if ( 'data' === $url['scheme'] ) {
                      continue;
                  }

                  // Build a simple form of the URL so we can compare it to ones we found in IMAGES and exclude those.
                  $simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
                  if ( in_array( $simple_url, $image_urls, true ) ) {
                      continue;
                  }

                  // Limit the split to the first '://' so a URL embedded in the query string is not cut off.
                  list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

                  // Build a reversed hostname, e.g. "www.example.com" => "com.example.www".
                  $host_reversed = implode( '.', array_reverse( explode( '.', $url['host'] ) ) );

                  // @todo Check unique before adding
                  $links[] = array(
                      'url'           => $link_all_but_proto,
                      'host_reversed' => $host_reversed,
                      'host'          => $url['host'],
                  );
              }
          }

          $link_count = count( $links );
          if ( $link_count ) {
              $extracted['link'] = $links;
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['link'] = $link_count;
          }
      }

      // ----------------------------------- EMBEDS ------------------------------

      // Embeds are just individual links on their own line.
      if ( self::EMBEDS & $what_to_extract ) {

          if ( ! function_exists( '_wp_oembed_get_object' ) ) {
              include( ABSPATH . WPINC . '/class-oembed.php' );
          }

          // Get an oembed object.
          $oembed = _wp_oembed_get_object();

          // Grab any links on their own lines that may be embeds.
          if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

              // To hold the extracted stuff we find.
              $embeds = array();

              foreach ( $matches[1] as $link_raw ) {
                  list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

                  // Check whether this "link" is really an embed.
                  foreach ( $oembed->providers as $matchmask => $data ) {
                      // The second element of each provider entry says whether $matchmask is already a regex.
                      $is_regex = ! empty( $data[1] );

                      // Turn the asterisk-type provider URLs into regex.
                      if ( ! $is_regex ) {
                          $matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
                          $matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
                      }

                      if ( preg_match( $matchmask, $link_raw ) ) {
                          $embeds[] = $link_all_but_proto; // @todo Check unique before adding

                          // @todo Try to get ID's for the ones we care about (shortcode_keepers)
                          break;
                      }
                  }
              }

              if ( ! empty( $embeds ) ) {
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['embed'] = count( $embeds );
                  $extracted['embed']        = array( 'url' => array() );
                  foreach ( $embeds as $e ) {
                      $extracted['embed']['url'][] = $e;
                  }
              }
          }
      }

      return $extracted;
  }
  288.  
  /**
   * Collects the images of a post from its featured image, slideshows, galleries and markup,
   * using Jetpack Post Images.
   *
   * @param object $post A post object; its ID and post_content are read.
   * @param array  $args Optional args, see the defaults list below for details.
   * @return array The structure produced by build_image_struct() for the collected image URLs,
   *               with the gallery indicator added to its 'has' map; an empty array when no image was found.
   */
  private static function get_image_fields( $post, $args = array() ) {

      $defaults = array(
          'width'  => 200, // Required minimum width (if possible to determine)
          'height' => 200, // Required minimum height (if possible to determine)
      );

      $args = wp_parse_args( $args, $defaults );

      $image_list     = array();
      $image_booleans = array( 'gallery' => 0 );

      $from_featured_image = Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] );
      if ( ! empty( $from_featured_image ) ) {
          $image_list = array_merge( $image_list, wp_list_pluck( $from_featured_image, 'src' ) );
      }

      $from_slideshow = Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] );
      if ( ! empty( $from_slideshow ) ) {
          $image_list = array_merge( $image_list, wp_list_pluck( $from_slideshow, 'src' ) );
      }

      $from_gallery = Jetpack_PostImages::from_gallery( $post->ID );
      if ( ! empty( $from_gallery ) ) {
          $image_list = array_merge( $image_list, wp_list_pluck( $from_gallery, 'src' ) );
          $image_booleans['gallery']++; // @todo This count isn't correct, will only ever count 1
      }

      // @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
      $image_list = self::get_images_from_html( $post->post_content, $image_list );

      $image_struct = self::build_image_struct( $image_list );

      // The gallery indicator used to be computed and then dropped; report it alongside the image count.
      // Values already present in 'has' win, so nothing build_image_struct() reports is overwritten.
      if ( ! empty( $image_struct ) ) {
          $has = ( isset( $image_struct['has'] ) && is_array( $image_struct['has'] ) ) ? $image_struct['has'] : array();

          $image_struct['has'] = array_merge( $image_booleans, $has );
      }

      return $image_struct;
  }
  328.  
  /**
   * Finds the images referenced in a piece of markup and wraps them in the image structure.
   *
   * @param string $content    Markup to search for images.
   * @param array  $image_list Image URLs that are already known; new ones are added to these.
   * @return array Whatever build_image_struct() returns for the combined list.
   */
  public static function extract_images_from_content( $content, $image_list ) {
      return self::build_image_struct(
          self::get_images_from_html( $content, $image_list )
      );
  }
  332.  
  /**
   * Wraps a flat list of image URLs in the structure used for extracted images.
   *
   * @param array $image_list Image URLs; duplicates are dropped.
   * @return array array( 'image' => list of array( 'url' => ... ), 'has' => array( 'image' => count ) ),
   *               or an empty array when the list is empty.
   */
  public static function build_image_struct( $image_list ) {
      // Nothing to describe.
      if ( empty( $image_list ) ) {
          return array();
      }

      $entries = array();
      foreach ( array_unique( $image_list ) as $image_url ) {
          $entries[] = array( 'url' => $image_url );
      }

      return array(
          'image' => $entries,
          'has'   => array( 'image' => count( $entries ) ),
      );
  }
  345.  
  /**
   * Adds the image URLs found in some markup to a list of already known image URLs.
   *
   * @param string $html Some markup, possibly containing image tags
   * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
   * @return array Image URLs extracted from the HTML, stripped of query params and de-duped
   */
  public static function get_images_from_html( $html, $images_already_extracted ) {
      $found = Jetpack_PostImages::from_html( $html );
      if ( empty( $found ) ) {
          // No images in the markup: the known list is returned untouched.
          return $images_already_extracted;
      }

      $collected = $images_already_extracted;

      foreach ( wp_list_pluck( $found, 'src' ) as $candidate ) {
          $parts = parse_url( $candidate );

          if ( $parts && isset( $parts['scheme'], $parts['host'], $parts['path'] ) ) {
              // Reassemble the URL from its parts, leaving the query string out.
              $clean = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
          } else {
              // parse_url() was no help: cut at the first '?' if there is one past position 0,
              // otherwise take the URL as it is.
              $query_pos = strpos( $candidate, '?' );
              $clean     = $query_pos ? substr( $candidate, 0, $query_pos ) : $candidate;
          }

          // URLs over 4KB are likely data URIs or malformed HTML, so they are dropped.
          if ( strlen( $clean ) > 4096 ) {
              continue;
          }

          if ( in_array( $clean, $collected ) ) {
              continue;
          }

          $collected[] = $clean;
      }

      return $collected;
  }
  374.  
  /**
   * Reduces post content to plain text: tags removed, HTML entities decoded,
   * and shortcodes stripped via strip_shortcodes().
   *
   * @param string $content The content to clean up.
   * @return string The cleaned-up text.
   */
  private static function get_stripped_content( $content ) {
      $text_only = html_entity_decode( strip_tags( $content ) );

      return strip_shortcodes( $text_only );
  }