Copie de Tumblr sur Shaarli
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

228 lines
6.8 KiB

  1. <?php
  2. namespace League\HTMLToMarkdown;
  3. /**
  4. * Class HtmlConverter
  5. *
  6. * A helper class to convert HTML to Markdown.
  7. *
  8. * @author Colin O'Dell <colinodell@gmail.com>
  9. * @author Nick Cernis <nick@cern.is>
  10. *
  11. * @link https://github.com/thephpleague/html-to-markdown/ Latest version on GitHub.
  12. *
  13. * @license http://www.opensource.org/licenses/mit-license.php MIT
  14. */
  class HtmlConverter
  {
      /**
       * Environment that supplies the configuration and the per-tag converters.
       *
       * @var Environment
       */
      protected $environment;

      /**
       * Constructor
       *
       * Builds the default environment from the built-in defaults below and then
       * merges the caller-supplied options on top of them.
       *
       * @param array $options Configuration options
       */
      public function __construct(array $options = array())
      {
          $defaults = array(
              'header_style'    => 'setext', // Set to 'atx' to output H1 and H2 headers as # Header1 and ## Header2
              'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML
              'strip_tags'      => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
              'bold_style'      => '**', // Set to '__' if you prefer the underlined style
              'italic_style'    => '_', // Set to '*' if you prefer the asterisk style
              'remove_nodes'    => '', // space-separated list of dom nodes that should be removed. example: 'meta style script'
          );

          $this->environment = Environment::createDefaultEnvironment($defaults);
          $this->environment->getConfig()->merge($options);
      }

      /**
       * @return Environment
       */
      public function getEnvironment()
      {
          return $this->environment;
      }

      /**
       * Shortcut for getEnvironment()->getConfig().
       *
       * @return Configuration
       */
      public function getConfig()
      {
          return $this->environment->getConfig();
      }

      /**
       * Convert
       *
       * Allows the converter instance to be used as a callable; simply delegates to convert().
       *
       * @see HtmlConverter::convert
       *
       * @param string $html
       *
       * @return string The Markdown version of the html
       */
      public function __invoke($html)
      {
          return $this->convert($html);
      }

      /**
       * Convert
       *
       * Loads the HTML into a DOMDocument, converts the tree in place starting at the
       * <html> element, serialises the document and strips the leftover wrapper markup.
       *
       * @param string $html
       *
       * @return string The Markdown version of the html ('' for blank input)
       *
       * @throws \InvalidArgumentException if the parsed document contains no <html> element
       */
      public function convert($html)
      {
          if (trim($html) === '') {
              return '';
          }

          $document = $this->createDOMDocument($html);

          // Work on the entire DOM tree (including head and body)
          if (!($root = $document->getElementsByTagName('html')->item(0))) {
              throw new \InvalidArgumentException('Invalid HTML was provided');
          }

          $rootElement = new Element($root);
          $this->convertChildren($rootElement);

          // Store the now-modified DOMDocument as a string
          $markdown = $document->saveHTML();

          return $this->sanitize($markdown);
      }

      /**
       * Parses the given HTML into a UTF-8 DOMDocument.
       *
       * When the 'suppress_errors' option is truthy, libxml parse errors are collected
       * internally (instead of being raised as warnings) and discarded afterwards. The
       * libxml error mode that was active before the call is restored, so the setting
       * does not leak into unrelated code running in the same process.
       *
       * @param string $html
       *
       * @return \DOMDocument
       */
      private function createDOMDocument($html)
      {
          $document = new \DOMDocument();
          $suppressErrors = (bool) $this->getConfig()->getOption('suppress_errors');

          $previousErrorMode = false;
          if ($suppressErrors) {
              // Suppress conversion errors (from http://bit.ly/pCCRSX); the call returns the previous mode
              $previousErrorMode = libxml_use_internal_errors(true);
          }

          // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt)
          $document->loadHTML('<?xml encoding="UTF-8">' . $html);
          $document->encoding = 'UTF-8';

          if ($suppressErrors) {
              libxml_clear_errors();
              // Hand the global libxml error mode back to whatever the caller had configured
              libxml_use_internal_errors($previousErrorMode);
          }

          return $document;
      }

      /**
       * Convert Children
       *
       * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
       *
       * Children are converted first, then the element itself, so the innermost elements
       * are processed before the elements that contain them.
       *
       * @param ElementInterface $element
       */
      private function convertChildren(ElementInterface $element)
      {
          // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
          if ($element->isDescendantOf(array('pre', 'code'))) {
              return;
          }

          // If the node has children, convert those to Markdown first
          if ($element->hasChildren()) {
              foreach ($element->getChildren() as $child) {
                  $this->convertChildren($child);
              }
          }

          // Now that child nodes have been converted, convert the original node
          $markdown = $this->convertToMarkdown($element);

          // Hand the result to the element; setFinalMarkdown() is expected to swap the original
          // node (e.g. '<h3>Title</h3>') for its Markdown text (e.g. '### Title')
          $element->setFinalMarkdown($markdown);
      }

      /**
       * Convert to Markdown
       *
       * Produces the Markdown equivalent of a single element by delegating to the
       * converter the environment has registered for the element's tag name.
       *
       * Example: An <h3> node with text content of 'Title' becomes '### Title'
       *
       * @param ElementInterface $element
       *
       * @return string The converted HTML as Markdown; an empty string when the tag is listed in 'remove_nodes'
       */
      protected function convertToMarkdown(ElementInterface $element)
      {
          $tag = $element->getTagName();

          // Strip nodes named in remove_nodes
          $tagsToRemove = explode(' ', $this->getConfig()->getOption('remove_nodes'));
          if (in_array($tag, $tagsToRemove, true)) {
              // Honour the documented string return type (previously returned false)
              return '';
          }

          $converter = $this->environment->getConverterByTag($tag);

          return $converter->convert($element);
      }

      /**
       * Cleans up the serialised document: decodes HTML entities, drops the doctype and
       * peels the wrapper markup that DOMDocument adds off both ends of the string.
       *
       * @param string $markdown
       *
       * @return string
       */
      protected function sanitize($markdown)
      {
          $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
          $markdown = preg_replace('/<!DOCTYPE [^>]+>/', '', $markdown); // Strip doctype declaration
          $markdown = trim($markdown); // Remove blank spaces at the beginning of the html

          /*
           * Removing unwanted tags. Tags should be added to the array in the order they are expected.
           * XML, html and body opening tags should be in that order. Same case with closing tags
           */
          $unwanted = array('<?xml encoding="UTF-8">', '<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '&#xD;');

          foreach ($unwanted as $tag) {
              $tagLength = strlen($tag);

              if (strpos($tag, '/') === false) {
                  // Opening tags: only removed when they are the very first thing in the string
                  if (strpos($markdown, $tag) === 0) {
                      $markdown = substr($markdown, $tagLength);
                  }
              } else {
                  // Closing tags: only removed when they are the very last thing in the string.
                  // Compare the tail directly; searching for the first occurrence would miss the
                  // trailing tag whenever the same text also appears earlier in the content.
                  if (substr($markdown, -$tagLength) === $tag) {
                      $markdown = substr($markdown, 0, -$tagLength);
                  }
              }
          }

          // Trim surrounding line breaks (plus NUL / vertical tab) but keep spaces and tabs
          $markdown = trim($markdown, "\n\r\0\x0B");

          return $markdown;
      }
  }