A. Michaels · c82c87cf · 8bab4800 · 95a0c966 · d57b4ff5 · dc73033f
--- a/src/Drush/Commands/MigrateCleanupCommand.php 0 → 100644

+ 277

− 0
+++ b/src/Drush/Commands/MigrateCleanupCommand.php 0 → 100644

+ 277

− 0
+<?php
+
+namespace Drupal\cambridge_migrations\Drush\Commands;
+
+use Drush\Commands\DrushCommands;
+use Drupal\Core\Entity\EntityTypeManagerInterface;
+use Drupal\paragraphs\Entity\Paragraph;
+use Symfony\Component\DependencyInjection\ContainerInterface;
+use DOMDocument;
+use DOMElement;
+use DOMNode;
+
+/**
+ * Drush command for cleaning up HTML content in text paragraphs.
+ *
+ * This command:
+ * - Queries all paragraphs of type "text"
+ * - For each paragraph, processes its HTML content to:
+ *   - Remove all class and style attributes from every element
+ *   - Recursively remove empty tags or tags with only non-breaking spaces
+ *   - Remove all <style> and <script> elements entirely
+ *
+ * To run:
+ *   ddev drush cambridge:migrate-cleanup
+ */
+class MigrateCleanupCommand extends DrushCommands {
+
+  /**
+   * Number of paragraphs loaded per batch to keep memory usage bounded.
+   */
+  private const CHUNK_SIZE = 50;
+
+  /**
+   * Tags that are removed when they end up without meaningful content.
+   *
+   * Elements outside this list are always kept, even when they are empty.
+   */
+  private const EMPTY_CHECK_TAGS = [
+    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'em', 'i',
+    'small', 'mark', 'del', 'ins', 'sub', 'sup', 'q', 'cite', 'pre',
+  ];
+
+  /**
+   * The entity type manager.
+   *
+   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
+   */
+  protected $entityTypeManager;
+
+  /**
+   * Constructs a new MigrateCleanupCommand object.
+   *
+   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
+   *   The entity type manager.
+   */
+  public function __construct(
+    EntityTypeManagerInterface $entity_type_manager
+  ) {
+    parent::__construct();
+    $this->entityTypeManager = $entity_type_manager;
+  }
+
+  /**
+   * Clean up HTML content in text paragraphs.
+   *
+   * Loads every paragraph of bundle "text" in batches, runs the value of
+   * "field_text" through cleanupHtml() and saves the paragraph only when the
+   * cleaned markup differs from the stored markup. Failures on individual
+   * paragraphs are logged and counted; they do not abort the run.
+   *
+   * @command cambridge:migrate-cleanup
+   * @aliases cm-cleanup
+   */
+  public function migrateCleanup() {
+    try {
+      // Query all paragraphs of type "text".
+      $paragraph_storage = $this->entityTypeManager->getStorage('paragraph');
+      $paragraph_ids = $paragraph_storage->getQuery()
+        ->condition('type', 'text')
+        ->accessCheck(FALSE)
+        ->execute();
+
+      if (empty($paragraph_ids)) {
+        $this->logger()->warning(dt('No text paragraphs found to clean up.'));
+        return;
+      }
+
+      $this->logger()->notice(dt('Found @count text paragraphs to process.', [
+        '@count' => count($paragraph_ids),
+      ]));
+
+      $success_count = 0;
+      $error_count = 0;
+
+      // Load paragraphs in chunks to avoid memory issues.
+      foreach (array_chunk($paragraph_ids, self::CHUNK_SIZE, TRUE) as $chunk) {
+        foreach ($paragraph_storage->loadMultiple($chunk) as $paragraph_id => $paragraph) {
+          try {
+            // Ensure we're working with a Paragraph entity.
+            if (!($paragraph instanceof Paragraph)) {
+              $paragraph = Paragraph::load($paragraph_id);
+              if (!$paragraph) {
+                $this->logger()->warning(dt('Could not load paragraph @id', [
+                  '@id' => $paragraph_id,
+                ]));
+                continue;
+              }
+            }
+
+            // Skip if paragraph doesn't have a text field.
+            if (!$paragraph->hasField('field_text')) {
+              $this->logger()->warning(dt('Skipping paragraph @id - no text field found', [
+                '@id' => $paragraph->id(),
+              ]));
+              continue;
+            }
+
+            // Get the current HTML content and its text format.
+            $text_field = $paragraph->get('field_text');
+            $html = $text_field->value;
+            $format = $text_field->format;
+
+            // Strict comparison so that a literal "0" is not mistaken for
+            // missing content.
+            if ($html === NULL || $html === '') {
+              $this->logger()->notice(dt('Skipping paragraph @id - empty content', [
+                '@id' => $paragraph->id(),
+              ]));
+              continue;
+            }
+
+            // Process the HTML content.
+            $cleaned_html = $this->cleanupHtml($html);
+
+            // Avoid a needless entity save when nothing changed, which also
+            // makes re-running the command cheap.
+            if ($cleaned_html === $html) {
+              $this->logger()->notice(dt('Skipping paragraph @id - content already clean', [
+                '@id' => $paragraph->id(),
+              ]));
+              continue;
+            }
+
+            // Update the paragraph with the cleaned HTML.
+            $paragraph->set('field_text', [
+              'value' => $cleaned_html,
+              'format' => $format,
+            ]);
+            $paragraph->save();
+
+            $success_count++;
+            $this->logger()->notice(dt('Successfully cleaned up paragraph @id', [
+              '@id' => $paragraph->id(),
+            ]));
+          }
+          catch (\Exception $e) {
+            $error_count++;
+            // Use the loop key rather than the entity so that logging cannot
+            // fail when the entity itself is the problem.
+            $this->logger()->error(dt('Error cleaning up paragraph @id: @message', [
+              '@id' => $paragraph_id,
+              '@message' => $e->getMessage(),
+            ]));
+          }
+        }
+
+        // Clear static caches to avoid memory issues.
+        $paragraph_storage->resetCache($chunk);
+      }
+
+      $this->logger()->notice(dt('Cleanup complete. Successes: @success, Errors: @errors', [
+        '@success' => $success_count,
+        '@errors' => $error_count,
+      ]));
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('Cleanup failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+    }
+  }
+
+  /**
+   * Clean up HTML content.
+   *
+   * Parses the fragment inside a wrapper document, lets processNode() strip
+   * attributes and remove unwanted elements, and serialises the children of
+   * the wrapper's <body> back into a string.
+   *
+   * @param string $html
+   *   The HTML content to clean up.
+   *
+   * @return string
+   *   The cleaned up HTML content. The input is returned unchanged when it
+   *   is empty or cannot be parsed.
+   */
+  protected function cleanupHtml($html) {
+    // Nothing to do for empty input; return it as is.
+    if ($html === NULL || $html === '') {
+      return $html;
+    }
+
+    $doc = new DOMDocument();
+
+    // Explicitly keep the default of not discarding whitespace.
+    $doc->preserveWhiteSpace = TRUE;
+
+    // Collect libxml warnings (for example about HTML5 tags) internally
+    // instead of emitting PHP warnings.
+    $previous_value = libxml_use_internal_errors(TRUE);
+    try {
+      // Add a wrapper to ensure proper parsing of fragments and to declare
+      // the encoding of the content.
+      $loaded = $doc->loadHTML('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+    }
+    finally {
+      // Drop the collected errors so the buffer does not grow with every
+      // processed paragraph, then restore the previous setting.
+      libxml_clear_errors();
+      libxml_use_internal_errors($previous_value);
+    }
+
+    // Get the body element; bail out if parsing did not produce one.
+    $body = $loaded ? $doc->getElementsByTagName('body')->item(0) : NULL;
+    if ($body === NULL) {
+      return $html;
+    }
+
+    // Process the body element recursively.
+    $this->processNode($body);
+
+    // Serialise only the children so the wrapper markup is not returned.
+    $processed_html = '';
+    foreach ($body->childNodes as $child) {
+      $processed_html .= $doc->saveHTML($child);
+    }
+
+    return $processed_html;
+  }
+
+  /**
+   * Process a DOM node recursively.
+   *
+   * - <script> and <style> elements are detached from the document.
+   * - "class" and "style" attributes are removed from every element.
+   * - Elements listed in EMPTY_CHECK_TAGS are reported for removal when they
+   *   contain nothing but blank text.
+   * - Blank text nodes are kept when they sit next to meaningful siblings,
+   *   so that spacing between inline elements is not lost.
+   *
+   * @param \DOMNode $node
+   *   The DOM node to process.
+   *
+   * @return bool
+   *   TRUE if the node should be kept, FALSE if it should be removed.
+   */
+  protected function processNode(DOMNode $node) {
+    // Text nodes count as content only when they are not blank.
+    if ($node->nodeType === XML_TEXT_NODE) {
+      return !$this->isBlankText($node->textContent);
+    }
+
+    // Any other non-element node (comments etc.) is kept untouched.
+    if ($node->nodeType !== XML_ELEMENT_NODE) {
+      return TRUE;
+    }
+
+    $tag = strtolower($node->nodeName);
+
+    // Script and style elements are removed entirely.
+    if (in_array($tag, ['script', 'style'], TRUE)) {
+      if ($node->parentNode !== NULL) {
+        $node->parentNode->removeChild($node);
+      }
+      return FALSE;
+    }
+
+    // Remove class and style attributes.
+    if ($node instanceof DOMElement) {
+      $node->removeAttribute('class');
+      $node->removeAttribute('style');
+    }
+
+    // Iterate over a snapshot because children are removed while looping.
+    $has_content = FALSE;
+    $blank_text_nodes = [];
+    foreach (iterator_to_array($node->childNodes, FALSE) as $child) {
+      if ($this->processNode($child)) {
+        $has_content = TRUE;
+        continue;
+      }
+
+      // Defer the decision on blank text until we know whether this
+      // element has any meaningful content at all.
+      if ($child->nodeType === XML_TEXT_NODE) {
+        $blank_text_nodes[] = $child;
+        continue;
+      }
+
+      // The child may already have detached itself (script/style).
+      if ($child->parentNode !== NULL) {
+        $child->parentNode->removeChild($child);
+      }
+    }
+
+    if ($has_content) {
+      return TRUE;
+    }
+
+    // Nothing meaningful is left: drop the blank text so the element is
+    // truly empty.
+    foreach ($blank_text_nodes as $blank_text_node) {
+      $node->removeChild($blank_text_node);
+    }
+
+    // Only the listed tags are removed when empty; everything else stays.
+    return !in_array($tag, self::EMPTY_CHECK_TAGS, TRUE);
+  }
+
+  /**
+   * Checks whether a text consists only of whitespace or non-breaking spaces.
+   *
+   * @param string $text
+   *   The text content of a DOM text node.
+   *
+   * @return bool
+   *   TRUE if nothing but whitespace and non-breaking spaces is present.
+   */
+  private function isBlankText($text) {
+    // Strip non-breaking spaces before trimming so that mixed runs of
+    // regular and non-breaking spaces are recognised as blank. "\xC2\xA0" is
+    // the UTF-8 encoding of U+00A0.
+    $text = str_replace(['&nbsp;', "\xC2\xA0"], '', $text);
+    // Strict comparison: a text of "0" is real content.
+    return trim($text) === '';
+  }
+
+  /**
+   * {@inheritdoc}
+   */
+  public static function create(ContainerInterface $container) {
+    return new static(
+      $container->get('entity_type.manager')
+    );
+  }
+
+}