From 80a0510a8d36e6cc3f446d3eb67dfdc7084c575c Mon Sep 17 00:00:00 2001 From: Ivan Tcholakov Date: Wed, 2 Mar 2011 18:53:52 +0200 Subject: [PATCH] Task #2972 - Inserting some fixes from the current development code of HTMLPurifier. --- .../library/HTMLPurifier.autoload.php | 5 + .../library/HTMLPurifier/AttrDef/URI/Host.php | 6 + .../HTMLPurifier/AttrTransform/SafeParam.php | 3 +- .../library/HTMLPurifier/Bootstrap.php | 10 +- .../library/HTMLPurifier/Config.php | 231 ++++++++++++++---- .../library/HTMLPurifier/ConfigSchema.php | 8 +- .../library/HTMLPurifier/Definition.php | 11 + .../HTMLPurifier/HTMLModule/SafeEmbed.php | 2 +- .../HTMLPurifier/HTMLModuleManager.php | 5 +- .../library/HTMLPurifier/Lexer.php | 6 +- 10 files changed, 224 insertions(+), 63 deletions(-) mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier.autoload.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/Bootstrap.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/ConfigSchema.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/Definition.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php mode change 100755 => 100644 main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier.autoload.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier.autoload.php old mode 100755 new mode 100644 index 8d40176406..62da5b60d4 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier.autoload.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier.autoload.php @@ -3,6 +3,7 @@ /** * @file * Convenience file that registers autoload handler for HTML Purifier. + * It also does some sanity checks. 
*/ if (function_exists('spl_autoload_register') && function_exists('spl_autoload_unregister')) { @@ -18,4 +19,8 @@ if (function_exists('spl_autoload_register') && function_exists('spl_autoload_un } } +if (ini_get('zend.ze1_compatibility_mode')) { + trigger_error("HTML Purifier is not compatible with zend.ze1_compatibility_mode; please turn it off", E_USER_ERROR); +} + // vim: et sw=4 sts=4 diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php old mode 100755 new mode 100644 index 2156c10c66..feca469d70 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrDef/URI/Host.php @@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef public function validate($string, $config, $context) { $length = strlen($string); + // empty hostname is OK; it's usually semantically equivalent: + // the default host as defined by a URI scheme is used: + // + // If the URI scheme defines a default for host, then that + // default applies when the host subcomponent is undefined + // or when the registered name is empty (zero length). 
if ($string === '') return ''; if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') { //IPv6 diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php index 91f67b08cb..4ceea62c0a 100644 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/AttrTransform/SafeParam.php @@ -19,6 +19,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform public function __construct() { $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded + $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent')); } public function transform($attr, $config, $context) { @@ -41,7 +42,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform } break; case 'wmode': - $attr['value'] = 'window'; + $attr['value'] = $this->wmode->validate($attr['value'], $config, $context); break; case 'movie': case 'src': diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Bootstrap.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Bootstrap.php old mode 100755 new mode 100644 index 559f61a232..607c5b1880 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Bootstrap.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Bootstrap.php @@ -37,7 +37,12 @@ class HTMLPurifier_Bootstrap public static function autoload($class) { $file = HTMLPurifier_Bootstrap::getPath($class); if (!$file) return false; - require HTMLPURIFIER_PREFIX . '/' . $file; + // Technically speaking, it should be ok and more efficient to + // just do 'require', but Antonio Parraga reports that with + // Zend extensions such as Zend debugger and APC, this invariant + // may be broken. Since we have efficient alternatives, pay + // the cost here and avoid the bug. + require_once HTMLPURIFIER_PREFIX . '/' . 
$file; return true; } @@ -65,10 +70,11 @@ class HTMLPurifier_Bootstrap if ( ($funcs = spl_autoload_functions()) === false ) { spl_autoload_register($autoload); } elseif (function_exists('spl_autoload_unregister')) { + $buggy = version_compare(PHP_VERSION, '5.2.11', '<'); $compat = version_compare(PHP_VERSION, '5.1.2', '<=') && version_compare(PHP_VERSION, '5.1.0', '>='); foreach ($funcs as $func) { - if (is_array($func)) { + if ($buggy && is_array($func)) { // :TRICKY: There are some compatibility issues and some // places where we need to error out $reflector = new ReflectionMethod($func[0], $func[1]); diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Config.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Config.php index 3461c9f822..69c5683233 100644 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Config.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Config.php @@ -76,7 +76,8 @@ class HTMLPurifier_Config /** * Set to false if you do not want line and file numbers in errors - * (useful when unit testing) + * (useful when unit testing). This will also compress some errors + * and exceptions. */ public $chatty = true; @@ -318,26 +319,64 @@ class HTMLPurifier_Config * Retrieves object reference to the HTML definition. * @param $raw Return a copy that has not been setup yet. Must be * called before it's been setup, otherwise won't work. - */ - public function getHTMLDefinition($raw = false) { - return $this->getDefinition('HTML', $raw); + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawHTMLDefinition, which is more explicitly + * named, instead. 
+ */ + public function getHTMLDefinition($raw = false, $optimized = false) { + return $this->getDefinition('HTML', $raw, $optimized); } /** * Retrieves object reference to the CSS definition * @param $raw Return a copy that has not been setup yet. Must be * called before it's been setup, otherwise won't work. - */ - public function getCSSDefinition($raw = false) { - return $this->getDefinition('CSS', $raw); + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawCSSDefinition, which is more explicitly + * named, instead. + */ + public function getCSSDefinition($raw = false, $optimized = false) { + return $this->getDefinition('CSS', $raw, $optimized); + } + + /** + * Retrieves object reference to the URI definition + * @param $raw Return a copy that has not been setup yet. Must be + * called before it's been setup, otherwise won't work. + * @param $optimized If true, this method may return null, to + * indicate that a cached version of the modified + * definition object is available and no further edits + * are necessary. Consider using + * maybeGetRawURIDefinition, which is more explicitly + * named, instead. + */ + public function getURIDefinition($raw = false, $optimized = false) { + return $this->getDefinition('URI', $raw, $optimized); } /** * Retrieves a definition * @param $type Type of definition: HTML, CSS, etc * @param $raw Whether or not definition should be returned raw - */ - public function getDefinition($type, $raw = false) { + * @param $optimized Only has an effect when $raw is true. Whether + * or not to return null if the result is already present in + * the cache. This is off by default for backwards + * compatibility reasons, but you need to do things this + * way in order to ensure that caching is done properly. + * Check out enduser-customize.html for more details. 
+ * We probably won't ever change this default, as much as the + * maybe semantics is the "right thing to do." + */ + public function getDefinition($type, $raw = false, $optimized = false) { + if ($optimized && !$raw) { + throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false"); + } if (!$this->finalized) $this->autoFinalize(); // temporarily suspend locks, so we can handle recursive definition calls $lock = $this->lock; @@ -346,52 +385,137 @@ class HTMLPurifier_Config $cache = $factory->create($type, $this); $this->lock = $lock; if (!$raw) { - // see if we can quickly supply a definition + // full definition + // --------------- + // check if definition is in memory if (!empty($this->definitions[$type])) { - if (!$this->definitions[$type]->setup) { - $this->definitions[$type]->setup($this); - $cache->set($this->definitions[$type], $this); - } - return $this->definitions[$type]; - } - // memory check missed, try cache - $this->definitions[$type] = $cache->get($this); - if ($this->definitions[$type]) { - // definition in cache, return it - return $this->definitions[$type]; - } - } elseif ( - !empty($this->definitions[$type]) && - !$this->definitions[$type]->setup - ) { - // raw requested, raw in memory, quick return - return $this->definitions[$type]; + $def = $this->definitions[$type]; + // check if the definition is setup + if ($def->setup) { + return $def; + } else { + $def->setup($this); + if ($def->optimized) $cache->add($def, $this); + return $def; + } + } + // check if definition is in cache + $def = $cache->get($this); + if ($def) { + // definition in cache, save to memory and return it + $this->definitions[$type] = $def; + return $def; + } + // initialize it + $def = $this->initDefinition($type); + // set it up + $this->lock = $type; + $def->setup($this); + $this->lock = null; + // save in cache + $cache->add($def, $this); + // return it + return $def; + } else { + // raw definition + // -------------- + // check preconditions + $def 
= null; + if ($optimized) { + if (is_null($this->get($type . '.DefinitionID'))) { + // fatally error out if definition ID not set + throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID"); + } + } + if (!empty($this->definitions[$type])) { + $def = $this->definitions[$type]; + if ($def->setup && !$optimized) { + $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : ""; + throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra); + } + if ($def->optimized === null) { + $extra = $this->chatty ? " (try flushing your cache)" : ""; + throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra); + } + if ($def->optimized !== $optimized) { + $msg = $optimized ? "optimized" : "unoptimized"; + $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : ""; + throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra); + } + } + // check if definition was in memory + if ($def) { + if ($def->setup) { + // invariant: $optimized === true (checked above) + return null; + } else { + return $def; + } + } + // if optimized, check if definition was in cache + // (because we do the memory check first, this formulation + // is prone to cache slamming, but I think + // guaranteeing that either /all/ of the raw + // setup code or /none/ of it is run is more important.) + if ($optimized) { + // This code path only gets run once; once we put + // something in $definitions (which is guaranteed by the + // trailing code), we always short-circuit above. + $def = $cache->get($this); + if ($def) { + // save the full definition for later, but don't + // return it yet + $this->definitions[$type] = $def; + return null; + } + } + // check invariants for creation + if (!$optimized) { + if (!is_null($this->get($type . 
'.DefinitionID'))) { + if ($this->chatty) { + $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING); + } else { + $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING); + } + } } + // initialize it + $def = $this->initDefinition($type); + $def->optimized = $optimized; + return $def; + } + throw new HTMLPurifier_Exception("The impossible happened!"); + } + + private function initDefinition($type) { // quick checks failed, let's create the object if ($type == 'HTML') { - $this->definitions[$type] = new HTMLPurifier_HTMLDefinition(); + $def = new HTMLPurifier_HTMLDefinition(); } elseif ($type == 'CSS') { - $this->definitions[$type] = new HTMLPurifier_CSSDefinition(); + $def = new HTMLPurifier_CSSDefinition(); } elseif ($type == 'URI') { - $this->definitions[$type] = new HTMLPurifier_URIDefinition(); + $def = new HTMLPurifier_URIDefinition(); } else { throw new HTMLPurifier_Exception("Definition of $type type not supported"); } - // quick abort if raw - if ($raw) { - if (is_null($this->get($type . 
'.DefinitionID'))) { - // fatally error out if definition ID not set - throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID"); + $this->definitions[$type] = $def; + return $def; } - return $this->definitions[$type]; + + public function maybeGetRawDefinition($name) { + return $this->getDefinition($name, true, true); } - // set it up - $this->lock = $type; - $this->definitions[$type]->setup($this); - $this->lock = null; - // save in cache - $cache->set($this->definitions[$type], $this); - return $this->definitions[$type]; + + public function maybeGetRawHTMLDefinition() { + return $this->getDefinition('HTML', true, true); + } + + public function maybeGetRawCSSDefinition() { + return $this->getDefinition('CSS', true, true); + } + + public function maybeGetRawURIDefinition() { + return $this->getDefinition('URI', true, true); } /** @@ -549,17 +673,22 @@ class HTMLPurifier_Config /** * Produces a nicely formatted error message by supplying the - * stack frame information from two levels up and OUTSIDE of - * HTMLPurifier_Config. + * stack frame information OUTSIDE of HTMLPurifier_Config. */ protected function triggerError($msg, $no) { // determine previous stack frame - $backtrace = debug_backtrace(); - if ($this->chatty && isset($backtrace[1])) { - $frame = $backtrace[1]; - $extra = " on line {$frame['line']} in file {$frame['file']}"; - } else { $extra = ''; + if ($this->chatty) { + $trace = debug_backtrace(); + // zip(tail(trace), trace) -- but PHP is not Haskell har har + for ($i = 0, $c = count($trace); $i < $c - 1; $i++) { + if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') { + continue; + } + $frame = $trace[$i]; + $extra = " invoked on line {$frame['line']} in file {$frame['file']}"; + break; + } } trigger_error($msg . 
$extra, $no); } diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/ConfigSchema.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/ConfigSchema.php old mode 100755 new mode 100644 index 67be5c71fd..fadf7a5890 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/ConfigSchema.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/ConfigSchema.php @@ -60,7 +60,13 @@ class HTMLPurifier_ConfigSchema { * Unserializes the default ConfigSchema. */ public static function makeFromSerial() { - return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser')); + $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'); + $r = unserialize($contents); + if (!$r) { + $hash = sha1($contents); + trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR); + } + return $r; } /** diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Definition.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Definition.php old mode 100755 new mode 100644 index a7408c9749..c7f82eba43 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Definition.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Definition.php @@ -12,6 +12,17 @@ abstract class HTMLPurifier_Definition */ public $setup = false; + /** + * If true, write out the final definition object to the cache after + * setup. This will be true only if all invocations to get a raw + * definition object are also optimized. This does not cause file + * system thrashing because on subsequent calls the cached object + * is used and any writes to the raw definition object are short + * circuited. See enduser-customize.html for the high-level + * picture. + */ + public $optimized = null; + /** * What type of definition is it? 
*/ diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php old mode 100755 new mode 100644 index ea256716bb..9f3758a322 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModule/SafeEmbed.php @@ -21,7 +21,7 @@ class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule 'allowscriptaccess' => 'Enum#never', 'allownetworking' => 'Enum#internal', 'flashvars' => 'Text', - 'wmode' => 'Enum#window', + 'wmode' => 'Enum#window,transparent,opaque', 'name' => 'ID', ) ); diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php old mode 100755 new mode 100644 index f5c4a1d2cb..ce27efa6da --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/HTMLModuleManager.php @@ -216,13 +216,10 @@ class HTMLPurifier_HTMLModuleManager } } - // add proprietary module (this gets special treatment because - // it is completely removed from doctypes, etc.) 
+ // custom modules if ($config->get('HTML.Proprietary')) { $modules[] = 'Proprietary'; } - - // add SafeObject/Safeembed modules if ($config->get('HTML.SafeObject')) { $modules[] = 'SafeObject'; } diff --git a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Lexer.php b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Lexer.php index 61e065f33b..24a5418b3e 100644 --- a/main/inc/lib/htmlpurifier/library/HTMLPurifier/Lexer.php +++ b/main/inc/lib/htmlpurifier/library/HTMLPurifier/Lexer.php @@ -235,7 +235,7 @@ class HTMLPurifier_Lexer */ protected static function removeIEConditional($string) { return preg_replace( - '#<!--\[if [^>]+\]>.*<!\[endif\]-->#si', // probably should generalize for all strings + '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings '', $string ); @@ -273,11 +273,11 @@ class HTMLPurifier_Lexer $html = $this->escapeCommentedCDATA($html); } - $html = $this->removeIEConditional($html); - // escape CDATA $html = $this->escapeCDATA($html); + $html = $this->removeIEConditional($html); + // extract body from document if applicable if ($config->get('Core.ConvertDocumentToFragment')) { $e = false;