FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects

Feature/am 427 sidebar only basic html

Merged A. Michaels requested to merge feature/am-427-sidebar-only-basic-html into main
Files
4
+ 277
0
<?php
namespace Drupal\cambridge_migrations\Drush\Commands;
use Drush\Commands\DrushCommands;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\paragraphs\Entity\Paragraph;
use Symfony\Component\DependencyInjection\ContainerInterface;
use DOMDocument;
use DOMElement;
use DOMNode;
/**
 * Drush command for cleaning up HTML content in text paragraphs.
 *
 * This command:
 * - Queries all paragraphs of type "text"
 * - For each paragraph, processes its HTML content to:
 *   - Remove all class and style attributes from every element
 *   - Recursively remove empty tags or tags with only whitespace or
 *     non-breaking spaces (only for the tags in self::EMPTY_CHECK_TAGS)
 *   - Remove all <style> and <script> elements entirely
 * - Saves a paragraph only when the cleaned HTML differs from the stored one
 *
 * To run:
 *   ddev drush cambridge:migrate-cleanup
 */
class MigrateCleanupCommand extends DrushCommands {

  /**
   * Elements that are always removed together with their content.
   */
  protected const REMOVED_TAGS = ['script', 'style'];

  /**
   * Elements that are removed when they end up without meaningful content.
   *
   * Any element not listed here is kept even when it is empty.
   */
  protected const EMPTY_CHECK_TAGS = [
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'em', 'i',
    'small', 'mark', 'del', 'ins', 'sub', 'sup', 'q', 'cite', 'pre',
  ];

  /**
   * Number of paragraphs loaded per batch.
   */
  protected const CHUNK_SIZE = 50;

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * Constructs a new MigrateCleanupCommand object.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
   *   The entity type manager.
   */
  public function __construct(
    EntityTypeManagerInterface $entity_type_manager
  ) {
    parent::__construct();
    $this->entityTypeManager = $entity_type_manager;
  }

  /**
   * Clean up HTML content in text paragraphs.
   *
   * Paragraphs are processed in batches. A failure on one paragraph is logged
   * and counted, and processing continues with the next paragraph.
   *
   * @command cambridge:migrate-cleanup
   * @aliases cm-cleanup
   */
  public function migrateCleanup() {
    try {
      // Query all paragraphs of type "text".
      $paragraph_storage = $this->entityTypeManager->getStorage('paragraph');
      $paragraph_ids = $paragraph_storage->getQuery()
        ->condition('type', 'text')
        ->accessCheck(FALSE)
        ->execute();

      if (empty($paragraph_ids)) {
        $this->logger()->warning(dt('No text paragraphs found to clean up.'));
        return;
      }

      $this->logger()->notice(dt('Found @count text paragraphs to process.', [
        '@count' => count($paragraph_ids),
      ]));

      $success_count = 0;
      $error_count = 0;

      // Load paragraphs in chunks to avoid memory issues.
      foreach (array_chunk($paragraph_ids, self::CHUNK_SIZE, TRUE) as $chunk) {
        $paragraphs = $paragraph_storage->loadMultiple($chunk);

        foreach ($paragraphs as $paragraph_id => $paragraph) {
          try {
            // Ensure we're working with a Paragraph entity.
            if (!($paragraph instanceof Paragraph)) {
              $paragraph = Paragraph::load($paragraph_id);
              if (!$paragraph) {
                $this->logger()->warning(dt('Could not load paragraph @id', [
                  '@id' => $paragraph_id,
                ]));
                continue;
              }
            }

            // Skip if paragraph doesn't have a text field.
            if (!$paragraph->hasField('field_text')) {
              $this->logger()->warning(dt('Skipping paragraph @id - no text field found', [
                '@id' => $paragraph->id(),
              ]));
              continue;
            }

            // Get the current HTML content.
            $text_field = $paragraph->get('field_text');
            $html = $text_field->value;
            $format = $text_field->format;

            if (empty($html)) {
              $this->logger()->notice(dt('Skipping paragraph @id - empty content', [
                '@id' => $paragraph->id(),
              ]));
              continue;
            }

            // Process the HTML content.
            $cleaned_html = $this->cleanupHtml($html);

            // Avoid a pointless entity save when the markup is unchanged.
            if ($cleaned_html === $html) {
              $this->logger()->notice(dt('Skipping paragraph @id - content already clean', [
                '@id' => $paragraph->id(),
              ]));
              continue;
            }

            // Update the paragraph with the cleaned HTML.
            $paragraph->set('field_text', [
              'value' => $cleaned_html,
              'format' => $format,
            ]);
            $paragraph->save();

            $success_count++;
            $this->logger()->notice(dt('Successfully cleaned up paragraph @id', [
              '@id' => $paragraph->id(),
            ]));
          }
          catch (\Exception $e) {
            $error_count++;
            // Use the loop key: $paragraph may not be a usable entity here.
            $this->logger()->error(dt('Error cleaning up paragraph @id: @message', [
              '@id' => $paragraph_id,
              '@message' => $e->getMessage(),
            ]));
          }
        }

        // Clear static caches to avoid memory issues.
        $paragraph_storage->resetCache($chunk);
      }

      $this->logger()->notice(dt('Cleanup complete. Successes: @success, Errors: @errors', [
        '@success' => $success_count,
        '@errors' => $error_count,
      ]));
    }
    catch (\Exception $e) {
      $this->logger()->error(dt('Cleanup failed: @message', [
        '@message' => $e->getMessage(),
      ]));
    }
  }

  /**
   * Clean up HTML content.
   *
   * The fragment is wrapped in a minimal UTF-8 HTML document, parsed with
   * DOMDocument, processed with processNode() and serialized back from the
   * children of the <body> element.
   *
   * @param string $html
   *   The HTML content to clean up.
   *
   * @return string
   *   The cleaned up HTML content. Empty input is returned unchanged.
   *
   * @throws \RuntimeException
   *   When the HTML cannot be parsed into a document with a <body> element.
   */
  protected function cleanupHtml($html) {
    // If the HTML is empty, return it as is.
    if (empty($html)) {
      return $html;
    }

    $doc = new DOMDocument();
    $doc->preserveWhiteSpace = TRUE;

    // Collect libxml warnings (e.g. about HTML5 tags) internally instead of
    // emitting them, then discard them so they do not pile up in memory while
    // thousands of paragraphs are processed in one run.
    $previous_value = libxml_use_internal_errors(TRUE);
    try {
      // Add a wrapper to ensure proper parsing of fragments.
      $loaded = $doc->loadHTML(
        '<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>',
        LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
      );
    }
    finally {
      libxml_clear_errors();
      libxml_use_internal_errors($previous_value);
    }

    // Get the body element; fail loudly rather than passing NULL on.
    $body = $loaded ? $doc->getElementsByTagName('body')->item(0) : NULL;
    if (!($body instanceof DOMNode)) {
      throw new \RuntimeException('Unable to parse HTML content.');
    }

    // Process the body element recursively.
    $this->processNode($body);

    // Serialize only the children of <body>, dropping the wrapper again.
    $processed_html = '';
    foreach ($body->childNodes as $child) {
      $processed_html .= $doc->saveHTML($child);
    }

    return $processed_html;
  }

  /**
   * Process a DOM node recursively.
   *
   * For element nodes this strips the class and style attributes, removes
   * script/style elements, and removes descendants from
   * self::EMPTY_CHECK_TAGS that have no meaningful content.
   *
   * Text nodes that contain only whitespace or non-breaking spaces do not
   * count as content, but they are left in place so that whitespace between
   * sibling elements is preserved. They disappear only together with an
   * enclosing element that is removed as empty.
   *
   * @param \DOMNode $node
   *   The DOM node to process.
   *
   * @return bool
   *   TRUE if the node carries content and should be kept, FALSE if it is
   *   disposable (blank text, a removed script/style element, or an empty
   *   element from self::EMPTY_CHECK_TAGS).
   */
  protected function processNode(DOMNode $node) {
    if ($node->nodeType === XML_TEXT_NODE) {
      return !$this->isBlankText($node->textContent);
    }

    // Comments and other non-element nodes are always kept.
    if ($node->nodeType !== XML_ELEMENT_NODE) {
      return TRUE;
    }

    $tag_name = strtolower($node->nodeName);

    // Script and style elements are removed entirely.
    if (in_array($tag_name, self::REMOVED_TAGS, TRUE)) {
      if ($node->parentNode !== NULL) {
        $node->parentNode->removeChild($node);
      }
      return FALSE;
    }

    if ($node instanceof DOMElement) {
      $node->removeAttribute('class');
      $node->removeAttribute('style');
    }

    // Iterate over a snapshot: the live child list changes while nodes are
    // being removed.
    $has_content = FALSE;
    foreach (iterator_to_array($node->childNodes) as $child) {
      if ($this->processNode($child)) {
        $has_content = TRUE;
        continue;
      }
      // Remove disposable elements that are still attached. Blank text nodes
      // stay so that "<b>a</b> <i>b</i>" does not collapse into "ab".
      if ($child->nodeType !== XML_TEXT_NODE && $child->parentNode !== NULL) {
        $child->parentNode->removeChild($child);
      }
    }

    // Only the listed tags are dropped when they hold no content.
    if (!$has_content && in_array($tag_name, self::EMPTY_CHECK_TAGS, TRUE)) {
      return FALSE;
    }

    return TRUE;
  }

  /**
   * Checks whether a text consists only of whitespace or non-breaking spaces.
   *
   * The comparison is done against the empty string on purpose: a text such
   * as "0" is real content even though PHP's empty() treats it as empty.
   *
   * @param string $text
   *   The text content of a DOM text node.
   *
   * @return bool
   *   TRUE if nothing but whitespace, UTF-8 non-breaking spaces or literal
   *   "&nbsp;" sequences is present.
   */
  private function isBlankText(string $text): bool {
    $normalized = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $text);
    return trim($normalized) === '';
  }

  /**
   * {@inheritdoc}
   */
  public static function create(ContainerInterface $container) {
    return new static(
      $container->get('entity_type.manager')
    );
  }

}
Loading