Updated 2012-06-23 to fix a major security flaw.
Here's a class from another project that should do what you're looking for:
/**
 * Whitelist-based HTML sanitizer for untrusted markup.
 *
 * Elements that are not listed in SafeTags are unwrapped (the tag is removed,
 * its children are kept), attributes that are not listed in SafeAttributes
 * are removed, and URL-carrying attributes are kept only when their value
 * starts with one of SafeURLPrefixes.
 *
 * The class is a static-only utility and cannot be instantiated.
 */
final class Filter {
    private function __construct() {}

    # Space-separated whitelist of element names that are kept in the output.
    const SafeTags = 'a abbr acronym address b bdo big blockquote br caption center cite code col colgroup dd del dfn dir div dl dt em font h1 h2 h3 h4 h5 h6 hr i img ins kbd legend li ol p pre q s samp small span strike strong sub sup table tbody td tfoot th thead tr tt u ul var article aside figure footer header nav section rp rt ruby dialog hgroup mark time';

    # Space-separated whitelist of attribute names that are kept on safe elements.
    const SafeAttributes = 'href src title alt type rowspan colspan lang';

    # Subset of SafeAttributes whose value is a URL and must pass the prefix check.
    const URLAttributes = 'href src';

    # Space-separated list of prefixes a URL attribute value must start with
    # (compared case-insensitively). Anything else - javascript:, data:,
    # relative URLs - causes the attribute to be removed.
    const SafeURLPrefixes = 'http:// https://';

    /**
     * Sanitizes an HTML fragment against the tag/attribute whitelists.
     *
     * The input is interpreted as UTF-8. Comments and processing instructions
     * are dropped. The content of unsafe raw-text elements such as <script>
     * is kept, but only as escaped text, never as markup.
     *
     * @param string $html Untrusted HTML fragment (UTF-8).
     * @return string The sanitized fragment, without DOCTYPE, <html> or <body>.
     *                An empty string is returned if the input cannot be parsed.
     */
    public static function HTML($html) {
        # Flip the whitelists so membership is an exact, strict key lookup:
        $safeTags = array_flip(explode(' ', self::SafeTags));
        $safeAttributes = array_flip(explode(' ', self::SafeAttributes));
        $urlAttributes = array_flip(explode(' ', self::URLAttributes));

        # Parse quietly: libxml reports malformed markup and the whitelisted
        # HTML5 elements as warnings, which must not leak to the caller.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        # The leading XML declaration makes libxml decode the input as UTF-8
        # instead of its ISO-8859-1 default. The wrapper <div> stops libxml
        # from re-parenting top-level text into a <p>.
        $loaded = $dom->loadHTML('<?xml encoding="UTF-8"><div>' . $html . '</div>');
        if (!$previous) {
            # Only discard collected errors if the caller was not collecting them.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);
        if (!$loaded) {
            return '';
        }

        # The wrapper is the first <div> in document order because it is the
        # first thing in the parsed string.
        $wrapper = $dom->getElementsByTagName('div')->item(0);
        if ($wrapper === null || $wrapper->parentNode === null) {
            return '';
        }
        # Filter everything below the wrapper's parent (normally <body>), so
        # that content pushed out of the wrapper by a stray </div> in the
        # input is sanitized as well. The parent itself is never serialized.
        $root = $wrapper->parentNode;

        # Depth-first walk. Children are queued before their parent is
        # possibly unwrapped, so every element is visited exactly once no
        # matter where it ends up in the tree.
        $stack = new SplStack();
        self::scanChildren($root, $stack);
        while (!$stack->isEmpty()) {
            $element = $stack->pop();
            self::scanChildren($element, $stack);
            if (isset($safeTags[strtolower($element->nodeName)])) {
                self::filterAttributes($element, $safeAttributes, $urlAttributes);
            } else {
                self::unwrap($element);
            }
        }

        # Serialize node by node instead of searching the document string for
        # '<div>' / '</div>', which breaks when the input closes the wrapper.
        $result = '';
        foreach ($wrapper->childNodes as $child) {
            $result .= $dom->saveHTML($child);
        }
        for ($node = $wrapper->nextSibling; $node !== null; $node = $node->nextSibling) {
            $result .= $dom->saveHTML($node);
        }
        return $result;
    }

    /**
     * Prepares the direct children of $parent for filtering: element children
     * are queued on $stack, comments and processing instructions are removed,
     * and CDATA sections are replaced by ordinary text nodes.
     */
    private static function scanChildren(DOMNode $parent, SplStack $stack) {
        # Iterate over a snapshot; the live child list changes while we edit it.
        foreach (iterator_to_array($parent->childNodes, false) as $child) {
            if ($child instanceof DOMElement) {
                $stack->push($child);
            } elseif ($child instanceof DOMComment || $child instanceof DOMProcessingInstruction) {
                # Comments can carry markup (e.g. IE conditional comments).
                $parent->removeChild($child);
            } elseif ($child instanceof DOMCdataSection) {
                # libxml keeps <script>/<style> content as CDATA sections,
                # which are serialized verbatim. Once the enclosing element
                # is unwrapped that content would become live markup, so it
                # is turned into a text node that gets escaped on output.
                $text = $parent->ownerDocument->createTextNode($child->data);
                $parent->replaceChild($text, $child);
            }
        }
    }

    /**
     * Removes every attribute of $element that is not whitelisted, and every
     * URL attribute whose value does not start with a safe prefix.
     */
    private static function filterAttributes(DOMElement $element, array $safeAttributes, array $urlAttributes) {
        # Snapshot the attribute map; it is live and shrinks as we remove.
        foreach (iterator_to_array($element->attributes, false) as $attribute) {
            $name = strtolower($attribute->name);
            $allowed = isset($safeAttributes[$name])
                && (!isset($urlAttributes[$name]) || self::isSafeURL($attribute->value));
            if (!$allowed) {
                $element->removeAttributeNode($attribute);
            }
        }
    }

    /**
     * Tells whether $url starts with one of SafeURLPrefixes (case-insensitive).
     */
    private static function isSafeURL($url) {
        foreach (explode(' ', self::SafeURLPrefixes) as $prefix) {
            if (strncasecmp($url, $prefix, strlen($prefix)) === 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces $element by its own children, keeping their order.
     */
    private static function unwrap(DOMElement $element) {
        $parent = $element->parentNode;
        if ($parent === null) {
            return;
        }
        while ($element->firstChild !== null) {
            # insertBefore() moves the node, so firstChild advances each turn.
            $parent->insertBefore($element->firstChild, $element);
        }
        $parent->removeChild($element);
    }
}