* @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub.
* @link http://twitter.com/nickcernis Nick on twitter.
* @license http://www.opensource.org/licenses/mit-license.php MIT
*/
class HTML_To_Markdown {
/**
* @var DOMDocument The document tree that holds the HTML being converted. A fresh instance is created
*                  and loaded each time convert() runs.
*/
private $document;
/**
* @var string|boolean The Markdown version of the original HTML, or false if conversion failed.
*                     NOTE(review): nothing in the visible part of the class assigns this property;
*                     presumably get_markdown() sets it - confirm.
*/
private $output;
/**
* @var array Class-wide options users can override, either all at once through the constructor's
*            $overrides argument or one at a time with set_option().
*/
private $options = array(
'header_style' => 'atx', // 'atx' outputs H1 and H2 headers as # Header1 and ## Header2. NOTE(review): other accepted values are handled outside this section - confirm there
'suppress_errors' => true, // true hides libxml warnings raised while loading malformed HTML; set to false to show them
'strip_tags' => false, // Set to true to strip tags that don't have Markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
'bold_style' => '**', // Set to '__' if you prefer the underscore style
'italic_style' => '*', // Set to '_' if you prefer the underscore style
);
/**
 * Constructor
 *
 * Applies any option overrides on top of the defaults and, when HTML is supplied, converts it
 * immediately by handing it to convert(). The value returned by convert() is not used here.
 *
 * @param string $html      [optional] The HTML to convert to Markdown. Skipped when empty.
 * @param array  $overrides [optional] Option values that replace the defaults of the same key.
 */
public function __construct( $html = null, $overrides = null ) {
    // Overrides win over defaults because they are the later argument to array_merge()
    if ( ! empty( $overrides ) ) {
        $merged_options = array_merge( $this->options, $overrides );
        $this->options  = $merged_options;
    }

    // Nothing to convert yet; the caller can still invoke convert() later
    if ( empty( $html ) ) {
        return;
    }

    $this->convert( $html );
}
/**
* Setter for conversion options
*
* Stores $value under the key $name in $this->options, replacing the current value if the key
* already exists and adding the key if it does not. Neither argument is validated.
*
* @param string $name  The option key, e.g. 'header_style' or 'strip_tags'
* @param mixed  $value The new value for that option
*/
public function set_option( $name, $value ) {
$this->options[$name] = $value;
}
/**
 * Convert
 *
 * Parses the supplied HTML into a new DOMDocument (kept in $this->document) and returns whatever
 * get_markdown() produces for it.
 *
 * The markup is read as UTF-8. When the 'suppress_errors' option is on, libxml parse warnings are
 * buffered and discarded for the duration of the conversion, and the caller's previous libxml
 * error-handling setting is put back before returning.
 *
 * @param string $html The HTML to convert
 * @return string|boolean The value returned by get_markdown()
 */
public function convert( $html )
{
    // Strip white space between tags to prevent creation of empty #text nodes
    $html = preg_replace( '~>\s+<~', '><', $html );

    $this->document  = new DOMDocument();
    $suppress_errors = $this->options['suppress_errors'];

    if ( $suppress_errors ) {
        // Buffer parse warnings instead of emitting them, remembering the caller's setting
        $previous_setting = libxml_use_internal_errors( true );
    }

    // loadHTML() assumes ISO-8859-1 unless the markup declares an encoding, so prepend an
    // XML encoding declaration to have the input read as UTF-8
    $this->document->loadHTML( '<?xml encoding="UTF-8">' . $html );

    // The declaration is parsed into a processing-instruction node at the top of the document;
    // drop it again so it can never show up in the converted output
    foreach ( $this->document->childNodes as $top_level_node ) {
        if ( XML_PI_NODE === $top_level_node->nodeType ) {
            $this->document->removeChild( $top_level_node );
            break;
        }
    }

    $this->document->encoding = 'UTF-8';

    if ( $suppress_errors ) {
        libxml_clear_errors();
    }

    $markdown = $this->get_markdown();

    if ( $suppress_errors ) {
        // Do not leave libxml's global error handling changed for the calling code
        libxml_use_internal_errors( $previous_setting );
    }

    return $markdown;
}
/**
 * Is Child Of?
 *
 * Walks up the tree from the node's parent and reports whether any ancestor carries the given
 * node name. Despite the name, every ancestor is checked, not just the direct parent. The node
 * itself is never tested.
 *
 * @param string $parent_name The node name to search for among the ancestors (e.g. 'code')
 * @param object $node        The DOM node whose ancestors are inspected
 * @return bool True if an ancestor named $parent_name exists, false otherwise
 */
private static function is_child_of( $parent_name, $node ) {
    $ancestor = $node->parentNode;

    // parentNode is null once the top of the tree has been passed
    while ( null !== $ancestor ) {
        // Strict comparison: a non-string name must never match through type juggling
        if ( $ancestor->nodeName === $parent_name ) {
            return true;
        }
        $ancestor = $ancestor->parentNode;
    }

    return false;
}
/**
 * Convert Children
 *
 * Depth-first walk over the DOM: every child of the node is processed (recursively) before the
 * node itself is handed to convert_to_markdown(), so conversion runs from the innermost element
 * outwards.
 *
 * Nodes that sit anywhere below a code element are left untouched, as are their descendants.
 *
 * @param object $node The DOM node to process
 */
private function convert_children( $node ) {
    // Markup nested inside a code element must stay as literal HTML, so stop here
    if ( self::is_child_of( 'code', $node ) ) {
        return;
    }

    // Descend first: walk the children by position, using the count taken before the walk starts
    if ( $node->hasChildNodes() ) {
        $children = $node->childNodes;
        $total    = $children->length;
        $index    = 0;

        while ( $index < $total ) {
            $this->convert_children( $children->item( $index ) );
            $index++;
        }
    }

    // All descendants are done; now convert this node
    $this->convert_to_markdown( $node );
}
/**
* Get Markdown
*
* Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and
* returns the resulting converted document as a string in Markdown format.
*
* @return string|boolean The converted HTML as Markdown, or false if conversion failed
*/
private function get_markdown()
{
// Use the body tag as our root element
$body = $this->document->getElementsByTagName( 'body' )->item( 0 );
// Try the head tag if there's no body tag (e.g. the user's passed a single