diff --git a/lib/Html2Text/Html2Text.php b/lib/Html2Text/Html2Text.php index af59e84..9b26ebe 100644 --- a/lib/Html2Text/Html2Text.php +++ b/lib/Html2Text/Html2Text.php @@ -22,45 +22,43 @@ namespace Html2Text; +/** + * Converts HTML to formatted plain text + */ class Html2Text { - /** - * Contains the HTML content to convert. + * Contains the HTML content to convert. * - * @var string $html - * @access public + * @var string $html */ - public $html; + protected $html; /** - * Contains the converted, formatted text. + * Contains the converted, formatted text. * - * @var string $text - * @access public + * @var string $text */ - public $text; + protected $text; /** - * Maximum width of the formatted text, in columns. + * Maximum width of the formatted text, in columns. * - * Set this value to 0 (or less) to ignore word wrapping - * and not constrain text to a fixed-width column. + * Set this value to 0 (or less) to ignore word wrapping + * and not constrain text to a fixed-width column. * - * @var integer $width - * @access public + * @var integer $width */ - public $width = 70; + protected $width = 70; /** - * List of preg* regular expression patterns to search for, - * used in conjunction with $replace. + * List of preg* regular expression patterns to search for, + * used in conjunction with $replace. * - * @var array $search - * @access public - * @see $replace + * @var array $search + * @see $replace */ - public $search = array( + protected $search = array( "/\r/", // Non-legal carriage return "/[\n\t]+/", // Newlines and tabs '/
]*>.*?<\/head>/i', // @@ -86,13 +84,12 @@ class Html2Text ); /** - * List of pattern replacements corresponding to patterns searched. + * List of pattern replacements corresponding to patterns searched. * - * @var array $replace - * @access public - * @see $search + * @var array $replace + * @see $search */ - public $replace = array( + protected $replace = array( '', // Non-legal carriage return ' ', // Newlines and tabs '', // @@ -118,17 +115,16 @@ class Html2Text ); /** - * List of preg* regular expression patterns to search for, - * used in conjunction with $ent_replace. + * List of preg* regular expression patterns to search for, + * used in conjunction with $ent_replace. * - * @var array $ent_search - * @access public - * @see $ent_replace + * @var array $ent_search + * @see $ent_replace */ - public $ent_search = array( + protected $ent_search = array( '/&(nbsp|#160);/i', // Non-breaking space '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i', - // Double quotes + // Double quotes '/&(apos|rsquo|lsquo|#8216|#8217);/i', // Single quotes '/>/i', // Greater-than '/</i', // Less-than @@ -145,13 +141,12 @@ class Html2Text ); /** - * List of pattern replacements corresponding to patterns searched. + * List of pattern replacements corresponding to patterns searched. * - * @var array $ent_replace - * @access public - * @see $ent_search + * @var array $ent_replace + * @see $ent_search */ - public $ent_replace = array( + protected $ent_replace = array( ' ', // Non-breaking space '"', // Double quotes "'", // Single quotes @@ -170,29 +165,27 @@ class Html2Text ); /** - * List of preg* regular expression patterns to search for - * and replace using callback function. + * List of preg* regular expression patterns to search for + * and replace using callback function. * - * @var array $callback_search - * @access public + * @var array $callback_search */ - public $callback_search = array( + protected $callback_search = array( '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // - '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 - '/<(b)( [^>]*)?>(.*?)<\/b>/i', // - '/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // - '/<(th)( [^>]*)?>(.*?)<\/th>/i', //", with no corresponding closing tag. + * Sets the allowed HTML tags to pass through to the resulting text. * - * @access public - * @return void + * Tags should be in the form "
", with no corresponding closing tag. */ - public function set_allowed_tags( $allowed_tags = '' ) + public function set_allowed_tags($allowed_tags = '') { - if ( !empty($allowed_tags) ) { + if (!empty($allowed_tags)) { $this->allowed_tags = $allowed_tags; } } /** - * Sets a base URL to handle relative links. - * - * @access public - * @return void + * Sets a base URL to handle relative links. */ - public function set_base_url( $url = '' ) + public function set_base_url($url = '') { - if ( empty($url) ) { - if ( !empty($_SERVER['HTTP_HOST']) ) { + if (empty($url)) { + if (!empty($_SERVER['HTTP_HOST'])) { $this->url = 'http://' . $_SERVER['HTTP_HOST']; } else { $this->url = ''; @@ -382,7 +354,7 @@ public function set_base_url( $url = '' ) } else { // Strip any trailing slashes for consistency (relative // URLs may already start with a slash like "/file.html") - if ( substr($url, -1) == '/' ) { + if (substr($url, -1) == '/') { $url = substr($url, 0, -1); } $this->url = $url; @@ -390,12 +362,9 @@ public function set_base_url( $url = '' ) } /** - * Workhorse function that does actual conversion (calls _converter() method). - * - * @access private - * @return void + * Workhorse function that does actual conversion (calls _converter() method). */ - private function _convert() + protected function _convert() { // Variables used for building the link list $this->_link_list = array(); @@ -419,19 +388,16 @@ private function _convert() } /** - * Workhorse function that does actual conversion. - * - * First performs custom tag replacement specified by $search and - * $replace arrays. Then strips any remaining HTML tags, reduces whitespace - * and newlines to a readable format, and word wraps the text to - * $this->_options['width'] characters. + * Workhorse function that does actual conversion. * - * @param string Reference to HTML content string + * First performs custom tag replacement specified by $search and + * $replace arrays. Then strips any remaining HTML tags, reduces whitespace + * and newlines to a readable format, and word wraps the text to + * $this->_options['width'] characters. * - * @access private - * @return void + * @param string Reference to HTML content string */ - private function _converter(&$text) + protected function _converter(&$text) { // Convert
(before PRE!) $this->_convert_blockquotes($text); @@ -477,19 +443,18 @@ private function _converter(&$text) } /** - * Helper function called by preg_replace() on link replacement. + * Helper function called by preg_replace() on link replacement. * - * Maintains an internal list of links to be displayed at the end of the - * text, with numeric indices to the original point in the text they - * appeared. Also makes an effort at identifying and handling absolute - * and relative links. + * Maintains an internal list of links to be displayed at the end of the + * text, with numeric indices to the original point in the text they + * appeared. Also makes an effort at identifying and handling absolute + * and relative links. * - * @param string $link URL of the link - * @param string $display Part of the text to associate number with - * @access private - * @return string + * @param string $link URL of the link + * @param string $display Part of the text to associate number with + * @return string */ - private function _build_link_list( $link, $display, $link_override = null) + protected function _build_link_list($link, $display, $link_override = null) { $link_method = ($link_override) ? $link_override : $this->_options['do_links']; if ($link_method == 'none') @@ -500,6 +465,7 @@ private function _build_link_list( $link, $display, $link_override = null) if (preg_match('!^(javascript:|mailto:|#)!i', $link)) { return $display; } + if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) { $url = $link; } else { @@ -526,12 +492,11 @@ private function _build_link_list( $link, $display, $link_override = null) } /** - * Helper function for PRE body conversion. + * Helper function for PRE body conversion. * - * @param string HTML content - * @access private + * @param string HTML content */ - private function _convert_pre(&$text) + protected function _convert_pre(&$text) { // get the content of PRE element while (preg_match('/]*>(.*)<\/pre>/ismU', $text, $matches)) { @@ -544,6 +509,7 @@ private function _convert_pre(&$text) // convert the content $this->pre_content = sprintf('', preg_replace($this->pre_search, $this->pre_replace, $this->pre_content)); + // replace the content (use callback because content can contain $0 variable) $text = preg_replace_callback('/
%s]*>.*<\/pre>/ismU', array($this, '_preg_pre_callback'), $text, 1); @@ -554,12 +520,11 @@ private function _convert_pre(&$text) } /** - * Helper function for BLOCKQUOTE body conversion. + * Helper function for BLOCKQUOTE body conversion. * - * @param string HTML content - * @access private + * @param string HTML content */ - private function _convert_blockquotes(&$text) + protected function _convert_blockquotes(&$text) { if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) { $start = 0; @@ -609,12 +574,12 @@ private function _convert_blockquotes(&$text) } /** - * Callback function for preg_replace_callback use. + * Callback function for preg_replace_callback use. * - * @param array PREG matches - * @return string + * @param array PREG matches + * @return string */ - private function _preg_callback($matches) + protected function _preg_callback($matches) { switch (strtolower($matches[1])) { case 'b': @@ -638,12 +603,12 @@ private function _preg_callback($matches) } /** - * Callback function for preg_replace_callback use in PRE content handler. + * Callback function for preg_replace_callback use in PRE content handler. * - * @param array PREG matches - * @return string + * @param array PREG matches + * @return string */ - private function _preg_pre_callback($matches) + protected function _preg_pre_callback($matches) { return $this->pre_content; }