class-wp-rest-url-details-controller.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. <?php
  2. /**
  3. * REST API: WP_REST_URL_Details_Controller class
  4. *
  5. * @package WordPress
  6. * @subpackage REST_API
  7. * @since 5.9.0
  8. */
  9. /**
  10. * Controller which provides REST endpoint for retrieving information
  11. * from a remote site's HTML response.
  12. *
  13. * @since 5.9.0
  14. *
  15. * @see WP_REST_Controller
  16. */
  17. class WP_REST_URL_Details_Controller extends WP_REST_Controller {
  /**
   * Sets the route namespace and REST base used by this controller.
   *
   * Both values are consumed by register_routes() when the endpoint is registered.
   *
   * @since 5.9.0
   */
  public function __construct() {
      $this->rest_base = 'url-details';
      $this->namespace = 'wp-block-editor/v1';
  }
  /**
   * Registers the controller's single route.
   *
   * The route is `/<rest_base>` inside the controller namespace, answers
   * WP_REST_Server::READABLE requests and requires one `url` argument.
   *
   * @since 5.9.0
   */
  public function register_routes() {
      // Definition of the only accepted argument: the remote URL to inspect.
      $url_arg = array(
          'required'          => true,
          'description'       => __( 'The URL to process.' ),
          'validate_callback' => 'wp_http_validate_url',
          'sanitize_callback' => 'sanitize_url',
          'type'              => 'string',
          'format'            => 'uri',
      );

      // Handler definition for readable requests.
      $endpoint = array(
          'methods'             => WP_REST_Server::READABLE,
          'callback'            => array( $this, 'parse_url_details' ),
          'args'                => array(
              'url' => $url_arg,
          ),
          'permission_callback' => array( $this, 'permissions_check' ),
          'schema'              => array( $this, 'get_public_item_schema' ),
      );

      register_rest_route(
          $this->namespace,
          '/' . $this->rest_base,
          array( $endpoint )
      );
  }
  /**
   * Returns the JSON Schema describing the URL details item.
   *
   * The schema is built on first use and stored on `$this->schema`; every call
   * returns it passed through add_additional_fields_schema().
   *
   * @since 5.9.0
   *
   * @return array Item schema data.
   */
  public function get_item_schema() {
      // Build the schema only when no stored copy exists yet.
      if ( ! $this->schema ) {
          // Every property is exposed in the same contexts.
          $contexts = array( 'view', 'edit', 'embed' );

          $title_property = array(
              'description' => sprintf(
                  /* translators: %s: HTML title tag. */
                  __( 'The contents of the %s element from the URL.' ),
                  '<title>'
              ),
              'type'        => 'string',
              'context'     => $contexts,
              'readonly'    => true,
          );

          $icon_property = array(
              'description' => sprintf(
                  /* translators: %s: HTML link tag. */
                  __( 'The favicon image link of the %s element from the URL.' ),
                  '<link rel="icon">'
              ),
              'type'        => 'string',
              'format'      => 'uri',
              'context'     => $contexts,
              'readonly'    => true,
          );

          $description_property = array(
              'description' => sprintf(
                  /* translators: %s: HTML meta tag. */
                  __( 'The content of the %s element from the URL.' ),
                  '<meta name="description">'
              ),
              'type'        => 'string',
              'context'     => $contexts,
              'readonly'    => true,
          );

          $image_property = array(
              'description' => sprintf(
                  /* translators: 1: HTML meta tag, 2: HTML meta tag. */
                  __( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
                  '<meta property="og:image">',
                  '<meta property="og:image:url">'
              ),
              'type'        => 'string',
              'format'      => 'uri',
              'context'     => $contexts,
              'readonly'    => true,
          );

          $this->schema = array(
              '$schema'    => 'http://json-schema.org/draft-04/schema#',
              'title'      => 'url-details',
              'type'       => 'object',
              'properties' => array(
                  'title'       => $title_property,
                  'icon'        => $icon_property,
                  'description' => $description_property,
                  'image'       => $image_property,
              ),
          );
      }

      return $this->add_additional_fields_schema( $this->schema );
  }
  /**
   * Builds the URL details response for the requested remote URL.
   *
   * Reads the remote HTML from the per-URL cache or, on a cache miss, fetches it
   * and caches it. It then extracts the title, icon, description and Open Graph
   * image from the document head.
   *
   * @since 5.9.0
   *
   * @param WP_REST_Request $request Full details about the request.
   * @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
   */
  public function parse_url_details( $request ) {
      $url = untrailingslashit( $request['url'] );

      if ( empty( $url ) ) {
          return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
      }

      // Each URL gets its own cache entry.
      $cache_key = $this->build_cache_key_for_url( $url );

      // Start from the cached body; an empty value is treated as a cache miss.
      $remote_url_response = $this->get_cache( $cache_key );

      if ( empty( $remote_url_response ) ) {
          $remote_url_response = $this->get_remote_url( $url );

          // Errors and empty results are returned as-is and never cached.
          if ( empty( $remote_url_response ) || is_wp_error( $remote_url_response ) ) {
              return $remote_url_response;
          }

          $this->set_cache( $cache_key, $remote_url_response );
      }

      // All metadata is looked up in the head section only.
      $html_head     = $this->get_document_head( $remote_url_response );
      $meta_elements = $this->get_meta_with_content_elements( $html_head );

      $details = array(
          'title'       => $this->get_title( $html_head ),
          'icon'        => $this->get_icon( $html_head, $url ),
          'description' => $this->get_description( $meta_elements ),
          'image'       => $this->get_image( $meta_elements, $url ),
      );

      $data = $this->add_additional_fields_to_object( $details, $request );

      // Wrap the data in a response object.
      $response = rest_ensure_response( $data );

      /**
       * Filters the URL data for the response.
       *
       * @since 5.9.0
       *
       * @param WP_REST_Response $response            The response object.
       * @param string           $url                 The requested URL.
       * @param WP_REST_Request  $request             Request object.
       * @param string           $remote_url_response HTTP response body from the remote URL.
       */
      return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
  }
  /**
   * Decides whether the current user may request details for remote URLs.
   *
   * Access is granted to users with the `edit_posts` capability, or with the
   * `edit_posts` capability of any post type registered with `show_in_rest`.
   *
   * @since 5.9.0
   *
   * @return WP_Error|bool True if the request has permission, else WP_Error.
   */
  public function permissions_check() {
      $is_allowed = current_user_can( 'edit_posts' );

      if ( ! $is_allowed ) {
          // Fall back to the edit capability of each REST-enabled post type.
          $rest_post_types = get_post_types( array( 'show_in_rest' => true ), 'objects' );

          foreach ( $rest_post_types as $post_type ) {
              if ( current_user_can( $post_type->cap->edit_posts ) ) {
                  $is_allowed = true;
                  break;
              }
          }
      }

      if ( $is_allowed ) {
          return true;
      }

      return new WP_Error(
          'rest_cannot_view_url_details',
          __( 'Sorry, you are not allowed to process remote URLs.' ),
          array( 'status' => rest_authorization_required_code() )
      );
  }
  /**
   * Fetches the response body of a remote URL.
   *
   * @since 5.9.0
   *
   * @param string $url The website URL whose HTML to access.
   * @return string|WP_Error The response body on success.
   *                         WP_Error if the status code is not 200 or the body is empty.
   */
  private function get_remote_url( $url ) {
      /*
       * Send a custom user agent instead of the HTTP API default.
       *
       * Pingback requests use a UA containing `WordPress/`, which closely resembles
       * the default UA of the WP HTTP API. Sites that block pingbacks therefore tend
       * to block requests from this endpoint as well. Identifying as "WP-URLDetails"
       * keeps the WordPress attribution (via "WP") while avoiding that misdetection.
       *
       * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
       */
      $modified_user_agent = sprintf(
          'WP-URLDetails/%s (+%s)',
          get_bloginfo( 'version' ),
          get_bloginfo( 'url' )
      );

      /**
       * Filters the HTTP request args for URL data retrieval.
       *
       * Can be used to adjust response size limit and other WP_Http::request() args.
       *
       * @since 5.9.0
       *
       * @param array  $args Arguments used for the HTTP request.
       * @param string $url  The attempted URL.
       */
      $args = apply_filters(
          'rest_url_details_http_request_args',
          array(
              'limit_response_size' => 150 * KB_IN_BYTES,
              'user-agent'          => $modified_user_agent,
          ),
          $url
      );

      $response    = wp_safe_remote_get( $url, $args );
      $status_code = wp_remote_retrieve_response_code( $response );

      // Anything other than a 200 is reported as "not found"; the caller does not cache errors.
      if ( WP_Http::OK !== $status_code ) {
          return new WP_Error(
              'no_response',
              __( 'URL not found. Response returned a non-200 status code for this URL.' ),
              array( 'status' => WP_Http::NOT_FOUND )
          );
      }

      $remote_body = wp_remote_retrieve_body( $response );

      if ( empty( $remote_body ) ) {
          return new WP_Error(
              'no_content',
              __( 'Unable to retrieve body from response at this URL.' ),
              array( 'status' => WP_Http::NOT_FOUND )
          );
      }

      return $remote_body;
  }
  /**
   * Extracts the contents of the first `<title>` element from the given HTML.
   *
   * @since 5.9.0
   *
   * @param string $html The HTML from the remote website at URL.
   * @return string The title tag contents on success. Empty string if not found.
   */
  private function get_title( $html ) {
      preg_match( '#<title[^>]*>(.*?)<\s*/\s*title>#is', $html, $matches );

      $has_title = ! empty( $matches[1] ) && is_string( $matches[1] );

      if ( ! $has_title ) {
          return '';
      }

      return $this->prepare_metadata_for_output( trim( $matches[1] ) );
  }
  /**
   * Extracts the site icon URL from the given HTML.
   *
   * Finds the first `<link>` element whose `rel` is an icon variant and reads
   * its `href`. Data URLs are returned untouched; other values are made absolute
   * against the root (scheme and host) of `$url` when that can be determined.
   *
   * @since 5.9.0
   *
   * @param string $html The HTML from the remote website at URL.
   * @param string $url  The target website URL.
   * @return string The icon URI on success. Empty string if not found.
   */
  private function get_icon( $html, $url ) {
      // Step 1: locate the icon's link element.
      $link_pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
      preg_match( $link_pattern, $html, $link_match );

      if ( empty( $link_match[0] ) || ! is_string( $link_match[0] ) ) {
          return '';
      }

      $link_element = trim( $link_match[0] );

      // Step 2: pull the href value out of that element.
      $href_pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
      preg_match( $href_pattern, $link_element, $href_match );

      if ( empty( $href_match[2] ) || ! is_string( $href_match[2] ) ) {
          return '';
      }

      $icon = trim( $href_match[2] );

      // Step 3: data URLs are complete, and without a usable page URL nothing can be resolved.
      $icon_parts  = parse_url( $icon );
      $is_data_url = isset( $icon_parts['scheme'] ) && 'data' === $icon_parts['scheme'];

      if ( $is_data_url || ! is_string( $url ) || '' === $url ) {
          return $icon;
      }

      // Step 4: attempt to convert a relative URL into an absolute one.
      $url_parts = parse_url( $url );

      if ( isset( $url_parts['scheme'], $url_parts['host'] ) ) {
          $site_root = $url_parts['scheme'] . '://' . $url_parts['host'] . '/';
          $icon      = WP_Http::make_absolute_url( $icon, $site_root );
      }

      return $icon;
  }
  /**
   * Extracts the meta description from the collected meta elements.
   *
   * Looks for a meta element whose `name` attribute is `description` or
   * `og:description` and returns its content prepared for output.
   *
   * @since 5.9.0
   *
   * @param array $meta_elements {
   *     A multi-dimensional indexed array on success, else empty array.
   *
   *     @type string[] $0 Meta elements with a content attribute.
   *     @type string[] $1 Content attribute's opening quotation mark.
   *     @type string[] $2 Content attribute's value for each meta element.
   * }
   * @return string The meta description contents on success. Empty string if not found.
   */
  private function get_description( $meta_elements ) {
      // Nothing to search when no meta elements were collected.
      if ( empty( $meta_elements[0] ) ) {
          return '';
      }

      $description = $this->get_metadata_from_meta_element(
          $meta_elements,
          'name',
          '(?:description|og:description)'
      );

      // Only a found description is run through the output preparation.
      return '' === $description
          ? ''
          : $this->prepare_metadata_for_output( $description );
  }
  /**
   * Extracts the Open Graph (OG) image URL from the collected meta elements.
   *
   * Reads the content of the meta element whose `property` is `og:image` or
   * `og:image:url`. A found value is made absolute against the root (scheme
   * and host) of `$url` when those can be parsed from it.
   *
   * See: https://ogp.me/.
   *
   * @since 5.9.0
   *
   * @param array  $meta_elements {
   *     A multi-dimensional indexed array on success, else empty array.
   *
   *     @type string[] $0 Meta elements with a content attribute.
   *     @type string[] $1 Content attribute's opening quotation mark.
   *     @type string[] $2 Content attribute's value for each meta element.
   * }
   * @param string $url The target website URL.
   * @return string The OG image on success. Empty string if not found.
   */
  private function get_image( $meta_elements, $url ) {
      $image = $this->get_metadata_from_meta_element(
          $meta_elements,
          'property',
          '(?:og:image|og:image:url)'
      );

      // Only a found image needs resolving; an empty string is returned unchanged.
      if ( '' !== $image ) {
          $url_parts = parse_url( $url );

          // Attempt to convert relative URLs to absolute.
          if ( isset( $url_parts['scheme'], $url_parts['host'] ) ) {
              $site_root = $url_parts['scheme'] . '://' . $url_parts['host'] . '/';
              $image     = WP_Http::make_absolute_url( $image, $site_root );
          }
      }

      return $image;
  }
  /**
   * Prepares a metadata string for output.
   *
   * HTML entities (including quotes) are decoded first using the site charset,
   * then all HTML tags are stripped. Decoding first means entity-encoded tags
   * are removed as well.
   *
   * @since 5.9.0
   *
   * @param string $metadata The metadata content to prepare.
   * @return string The prepared metadata.
   */
  private function prepare_metadata_for_output( $metadata ) {
      $decoded = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );

      return wp_strip_all_tags( $decoded );
  }
  /**
   * Builds the cache key for a URL: a fixed prefix followed by the URL's MD5 hash.
   *
   * @since 5.9.0
   *
   * @param string $url The URL for which to build a cache key.
   * @return string The cache key.
   */
  private function build_cache_key_for_url( $url ) {
      $url_hash = md5( $url );

      return 'g_url_details_response_' . $url_hash;
  }
  /**
   * Reads the site transient stored under the given cache key.
   *
   * @since 5.9.0
   *
   * @param string $key The cache key.
   * @return mixed The value from the cache.
   */
  private function get_cache( $key ) {
      $cached_value = get_site_transient( $key );

      return $cached_value;
  }
  /**
   * Stores data as a site transient under the given cache key.
   *
   * The expiration defaults to one hour and can be changed through the
   * `rest_url_details_cache_expiration` filter.
   *
   * @since 5.9.0
   *
   * @param string $key  The cache key under which to store the value.
   * @param string $data The data to be stored at the given cache key.
   * @return bool True when transient set. False if not set.
   */
  private function set_cache( $key, $data = '' ) {
      /**
       * Filters the cache expiration.
       *
       * Can be used to adjust the time until expiration in seconds for the cache
       * of the data retrieved for the given URL.
       *
       * @since 5.9.0
       *
       * @param int $ttl The time until cache expiration in seconds.
       */
      $cache_expiration = apply_filters( 'rest_url_details_cache_expiration', HOUR_IN_SECONDS );

      return set_site_transient( $key, $data, $cache_expiration );
  }
  /**
   * Retrieves the head element section.
   *
   * The section runs from the opening `<head` tag up to, but not including, the
   * closing `</head>` tag, or the opening `<body` tag when no closing tag exists.
   * A `</head>` is then appended so the result is always terminated. Tag names
   * are matched case-insensitively.
   *
   * @since 5.9.0
   *
   * @param string $html The string of HTML to parse.
   * @return string The `<head>..</head>` section on success. Given `$html` if not found.
   */
  private function get_document_head( $html ) {
      // Find the opening `<head>` tag.
      $head_start = stripos( $html, '<head' );

      if ( false === $head_start ) {
          // Didn't find it. Return the original HTML.
          return $html;
      }

      /*
       * Find the closing `</head>` tag, searching only from the opening tag onwards
       * so the end position can never precede the start position.
       */
      $head_end = stripos( $html, '</head>', $head_start );

      if ( false === $head_end ) {
          // Didn't find it. Fall back to the opening `<body>` tag.
          $head_end = stripos( $html, '<body', $head_start );

          // Didn't find that either. Return the original HTML.
          if ( false === $head_end ) {
              return $html;
          }
      }

      /*
       * substr() expects a length, not an end offset. Passing the end offset would
       * return extra bytes from beyond the head section whenever the head does not
       * start at offset 0.
       */
      $head_html = substr( $html, $head_start, $head_end - $head_start );

      // Add the closing tag, which the extraction above excludes.
      return $head_html . '</head>';
  }
  /**
   * Collects every meta element that carries a `content` attribute.
   *
   * @since 5.9.0
   *
   * @param string $html The string of HTML to be parsed.
   * @return array {
   *     A multi-dimensional indexed array on success, else empty array.
   *
   *     @type string[] $0 Meta elements with a content attribute.
   *     @type string[] $1 Content attribute's opening quotation mark.
   *     @type string[] $2 Content attribute's value for each meta element.
   * }
   */
  private function get_meta_with_content_elements( $html ) {
      /*
       * Strategy: match on the content attribute first, and let callers pick the
       * element they want (e.g. name=description) afterwards.
       *
       * - Matching name=description first would truncate the value: a content value
       *   may legitimately contain `>` or `/>`, and a regex cannot tell that such a
       *   symbol sits inside quotation marks rather than closing the element.
       * - Matching name=description followed by content="(.*)" fails when the
       *   attributes appear in the opposite order or other attributes sit between them.
       * - A lookahead is not confined to the current element, so it could pair this
       *   `<meta` with a name or content that belongs to an element further down.
       */
      $pattern_parts = array(
          // Start of a meta element (`#` opens the pattern).
          '#<meta\s',

          // Any attributes before content: anything other than the > symbol.
          '[^>]*',

          /*
           * The content attribute, capturing its value (.*).
           *
           * Single or double quotes are allowed, as is whitespace in the value.
           * The opening quotation mark is captured and backreferenced (\1) so the
           * closing mark must be the same one; the value itself may contain the
           * other kind of quotation mark, such as an apostrophe.
           */
          'content=(["\']??)(.*)\1',

          // Any attributes after content: anything other than the > symbol.
          '[^>]*',

          // The closing symbol, either /> or >, then `#` to end the pattern.
          '\/?>#',

          /*
           * Modifiers:
           * - i : case insensitive
           * - s : lets . match newline characters (needed for multiline elements)
           * - U : non-greedy matching
           */
          'isU',
      );

      $pattern = implode( '', $pattern_parts );

      preg_match_all( $pattern, $html, $elements );

      return $elements;
  }
  /**
   * Returns the content value of the first meta element identified by an attribute.
   *
   * `$attr` and `$attr_value` are inserted into the search pattern as-is, so
   * `$attr_value` may be a regular expression fragment, e.g. `(?:og:image|og:image:url)`.
   *
   * @since 5.9.0
   *
   * @param array  $meta_elements {
   *     A multi-dimensional indexed array on success, else empty array.
   *
   *     @type string[] $0 Meta elements with a content attribute.
   *     @type string[] $1 Content attribute's opening quotation mark.
   *     @type string[] $2 Content attribute's value for each meta element.
   * }
   * @param string $attr       Attribute that identifies the element with the target metadata.
   * @param string $attr_value The attribute's value that identifies the element with the target metadata.
   * @return string The metadata on success. Empty string if not found.
   */
  private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
      // Bail out if there are no meta elements.
      if ( empty( $meta_elements[0] ) ) {
          return '';
      }

      /*
       * Pattern that identifies the target element by attribute and value.
       *
       * The value may be unquoted, single-quoted or double-quoted, with optional
       * whitespace around it. The opening quotation mark is captured and
       * backreferenced (\1) so the closing mark must be the same one.
       *
       * Modifiers:
       * - i : case insensitive
       * - s : lets . match newline characters (needed for multiline elements)
       * - U : non-greedy matching
       */
      $pattern = '#' . $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' . '#isU';

      foreach ( $meta_elements[0] as $index => $element ) {
          // Not the target element: keep looking.
          if ( ! preg_match( $pattern, $element ) ) {
              continue;
          }

          /*
           * First matching element wins. Its content value sits at the same index
           * in the captured-values list; an unusable value yields an empty string.
           */
          if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) {
              return trim( $meta_elements[2][ $index ] );
          }

          return '';
      }

      // No element matched.
      return '';
  }
  594. }