* @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub.
 * @link http://twitter.com/nickcernis Nick on twitter.
 * @license http://www.opensource.org/licenses/mit-license.php MIT
 */
class HTML_To_Markdown {

    /**
     * @var DOMDocument Root of the DOM tree built from the HTML being converted.
     */
    private $document;

    /**
     * @var string|boolean Markdown produced from the original HTML, or false if conversion failed.
     */
    private $output;

    /**
     * @var array Class-wide conversion options. Callers can override them through the
     *            constructor's $overrides argument or through set_option().
     */
    private $options = array(
        // "atx" outputs H1 and H2 headers as # Header1 and ## Header2.
        'header_style'    => 'atx',

        // Set to false to show warnings when loading malformed HTML.
        'suppress_errors' => true,

        // Set to true to strip tags that don't have Markdown equivalents.
        // N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
        'strip_tags'      => false,

        // Set to '__' if you prefer the underscore style.
        'bold_style'      => '**',

        // Set to '_' if you prefer the underscore style.
        'italic_style'    => '*',
    );

    /**
     * Constructor
     *
     * Set up a new DOMDocument from the supplied HTML, convert it to Markdown, and store it in $this->output.
     *
     * @param string $html The HTML to convert to Markdown.
     * @param array $overrides [optional] List of style and error display overrides.
*/
    public function __construct( $html = null, $overrides = null ) {
        if ( $overrides ) {
            $this->options = array_merge( $this->options, $overrides );
        }

        // PHP treats the string '0' as falsy, yet it is still a valid (if tiny) HTML fragment,
        // so it is checked for explicitly rather than being silently skipped.
        if ( $html || $html === '0' ) {
            $this->convert( $html );
        }
    }

    /**
     * Setter for conversion options
     *
     * Stores $value under $name in the options array, replacing any existing entry or adding a new one.
     *
     * @param string $name  The option key, e.g. 'header_style'.
     * @param mixed  $value The value to store for that option.
     */
    public function set_option( $name, $value ) {
        $this->options[$name] = $value;
    }

    /**
     * Convert
     *
     * Loads the HTML into a fresh DOMDocument (as UTF-8) and passes control to get_markdown().
     *
     * @param string $html The HTML to convert.
     * @return string|boolean Whatever get_markdown() returns: the Markdown version of the HTML,
     *                        or false if conversion failed.
     */
    public function convert( $html ) {
        // Strip white space between tags to prevent creation of empty #text nodes
        $html = preg_replace( '~>\s+<~', '><', $html );

        $this->document = new DOMDocument();

        // libxml's error mode is process-wide. Remember the previous setting so that suppressing
        // parser warnings here does not leak into unrelated code that runs afterwards.
        $suppress_errors  = (bool) $this->options['suppress_errors'];
        $previous_setting = $suppress_errors ? libxml_use_internal_errors( true ) : null;

        // loadHTML() assumes ISO-8859-1 unless the markup declares an encoding, so an XML encoding
        // declaration is prepended to make it read the input as UTF-8. The prefix also guarantees
        // loadHTML() is never handed an empty string.
        $this->document->loadHTML( '<?xml encoding="UTF-8">' . $html );

        // The prefix is parsed into a processing-instruction node at the top of the document.
        // Drop it again so the hack cannot show up in the serialised output.
        foreach ( $this->document->childNodes as $child ) {
            if ( $child->nodeType === XML_PI_NODE ) {
                $this->document->removeChild( $child );
                break; // Only the node we injected; stop before iterating a modified list.
            }
        }

        $this->document->encoding = 'UTF-8';

        if ( $suppress_errors ) {
            libxml_clear_errors();
            libxml_use_internal_errors( $previous_setting );
        }

        return $this->get_markdown();
    }

    /**
     * Is Child Of?
     *
     * Is the node nested, at any depth, inside an element with the given tag name?
     * Every ancestor is checked, not just the direct parent.
     *
     * @param string  $parent_name The name of the ancestor node to search for (e.g. 'code')
     * @param DOMNode $node        The node whose ancestors are inspected.
     * @return bool True if any ancestor's nodeName equals $parent_name, false otherwise.
     */
    private static function is_child_of( $parent_name, $node ) {
        // parentNode is null once the top of the tree is reached, which ends the walk.
        for ( $ancestor = $node->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode ) {
            if ( $ancestor->nodeName === $parent_name ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
     * starting with the innermost element and working up to the outermost element.
*
     * @param DOMNode $node The node whose descendants, and then the node itself, are converted.
     */
    private function convert_children( $node ) {
        // Anything nested inside a <code> element is left untouched - that should stay as HTML
        if ( self::is_child_of( 'code', $node ) ) {
            return;
        }

        // Children first: walk the child list by position, using the count taken before the walk starts
        if ( $node->hasChildNodes() ) {
            $children = $node->childNodes;
            $count    = $children->length;
            $index    = 0;

            while ( $index < $count ) {
                $this->convert_children( $children->item( $index ) );
                $index++;
            }
        }

        // With every child handled, convert the node itself
        $this->convert_to_markdown( $node );
    }

    /**
     * Get Markdown
     *
     * Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and
     * returns the resulting converted document as a string in Markdown format.
     *
     * @return string|boolean The converted HTML as Markdown, or false if conversion failed
     */
    private function get_markdown() {
        // Use the body tag as our root element
        $body = $this->document->getElementsByTagName( 'body' )->item( 0 );

        // Try the head tag if there's no body tag (e.g. the user's passed a single