From 111efb3ddb6ccc0b3848cce2dc4b96914aac6020 Mon Sep 17 00:00:00 2001 From: Clement Desmidt Date: Mon, 27 Feb 2017 14:07:17 +0100 Subject: [PATCH] :tada: First commit --- HTML-To-Markdown.php | 591 +++++++++++++++++++++++++++++++++++++++++++ export.php | 167 ++++++++++++ 2 files changed, 758 insertions(+) create mode 100644 HTML-To-Markdown.php create mode 100644 export.php diff --git a/HTML-To-Markdown.php b/HTML-To-Markdown.php new file mode 100644 index 0000000..efe102a --- /dev/null +++ b/HTML-To-Markdown.php @@ -0,0 +1,591 @@ + + * @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub. + * @link http://twitter.com/nickcernis Nick on twitter. + * @license http://www.opensource.org/licenses/mit-license.php MIT + */ +class HTML_To_Markdown { + /** + * @var DOMDocument The root of the document tree that holds our HTML. + */ + private $document; + + /** + * @var string|boolean The Markdown version of the original HTML, or false if conversion failed + */ + private $output; + + /** + * @var array Class-wide options users can override. + */ + private $options = array( + 'header_style' => 'atx', // Set to "atx" to output H1 and H2 headers as # Header1 and ## Header2 + 'suppress_errors' => true, // Set to false to show warnings when loading malformed HTML + 'strip_tags' => false, // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output. + 'bold_style' => '**', // Set to '__' if you prefer the underlined style + 'italic_style' => '*', // Set to '_' if you prefer the underlined style + ); + + + /** + * Constructor + * + * Set up a new DOMDocument from the supplied HTML, convert it to Markdown, and store it in $this->$output. + * + * @param string $html The HTML to convert to Markdown. + * @param array $overrides [optional] List of style and error display overrides. 
+     */
+    public function __construct( $html = null, $overrides = null ) {
+        // Caller-supplied options win over the class defaults
+        if ( $overrides ) {
+            $this->options = array_merge( $this->options, $overrides );
+        }
+
+        // Converting straight away is optional; convert() can also be called later
+        if ( $html ) {
+            $this->convert( $html );
+        }
+    }
+
+
+    /**
+     * Setter for conversion options
+     *
+     * Stores $value under $name in the options array, adding the key if it is not present yet.
+     *
+     * @param string $name  Option key, e.g. 'header_style' or 'strip_tags'
+     * @param mixed  $value New value for that option
+     */
+    public function set_option( $name, $value ) {
+        $this->options[$name] = $value;
+    }
+
+
+    /**
+     * Convert
+     *
+     * Loads the HTML into a fresh DOMDocument and passes control to get_markdown().
+     *
+     * @param string $html The HTML to convert
+     * @return string|boolean The Markdown version of the HTML, or false if conversion failed
+     */
+    public function convert( $html )
+    {
+        $html = preg_replace( '~>\s+<~', '><', $html ); // Strip white space between tags to prevent creation of empty #text nodes
+
+        $this->document = new DOMDocument();
+
+        $suppress_errors = $this->options['suppress_errors'];
+
+        if ( $suppress_errors ) {
+            // Suppress conversion errors (from http://bit.ly/pCCRSX ) and remember the caller's
+            // previous setting so it can be put back once parsing is done
+            $previous_setting = libxml_use_internal_errors( true );
+        }
+
+        // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt ): the XML encoding declaration tells
+        // libxml to decode the markup as UTF-8. Prepending an empty string has no effect, so the
+        // declaration must actually be present for non-ASCII text to survive the round trip.
+        $this->document->loadHTML( '<?xml encoding="UTF-8">' . $html );
+        $this->document->encoding = 'UTF-8';
+
+        if ( $suppress_errors ) {
+            libxml_clear_errors();
+            // Do not leave libxml's global error handling switched over for unrelated code
+            libxml_use_internal_errors( $previous_setting );
+        }
+
+        // get_markdown() reads $this->document and takes no arguments
+        return $this->get_markdown();
+    }
+
+
+    /**
+     * Is Child Of?
+     *
+     * Walks up the ancestor chain of the node and reports whether any ancestor has the given tag name.
+     *
+     * @param string  $parent_name The name of the parent node to search for (e.g. 'code')
+     * @param DOMNode $node        The node whose ancestors are inspected
+     * @return bool True if an ancestor named $parent_name exists, false otherwise
+     */
+    private static function is_child_of( $parent_name, $node ) {
+        // parentNode is null once the top of the tree has been passed, which ends the walk
+        for ( $p = $node->parentNode; $p !== null; $p = $p->parentNode ) {
+            if ( $p->nodeName === $parent_name ) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    /**
+     * Convert Children
+     *
+     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
+     *
+     * Finds children of each node and convert those to #text nodes containing their Markdown equivalent,
+     * starting with the innermost element and working up to the outermost element. 
+     *
+     * @param DOMNode $node The node to process; its descendants are handled before the node itself
+     */
+    private function convert_children( $node ) {
+        // Anything nested inside a <code> element is literal HTML and must be left untouched
+        if ( self::is_child_of( 'code', $node ) ) {
+            return;
+        }
+
+        // Depth first: read the child count once up front, then recurse into each position
+        $child_count = $node->hasChildNodes() ? $node->childNodes->length : 0;
+
+        for ( $index = 0; $index < $child_count; $index++ ) {
+            $this->convert_children( $node->childNodes->item( $index ) );
+        }
+
+        // With every descendant already turned into Markdown, convert this node last
+        $this->convert_to_markdown( $node );
+    }
+
+
+    /**
+     * Get Markdown
+     *
+     * Sends the body node to convert_children() to change inner nodes to Markdown #text nodes, then saves and
+     * returns the resulting converted document as a string in Markdown format.
+     *
+     * @return string|boolean The converted HTML as Markdown, or false if conversion failed
+     */
+    private function get_markdown()
+    {
+        // Use the body tag as our root element
+        $body = $this->document->getElementsByTagName( 'body' )->item( 0 );
+
+        // Try the head tag if there's no body tag (e.g. the user's passed a single