From 6d5200ad20772325b0bbcefa54ae6376dca87557 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 10:19:11 +0000
Subject: [PATCH 01/12] Include sidebar Text block paragraph type in migration
 script

---
 .../Commands/MigrateTextBlocksCommand.php     | 314 +++++++++++++++---
 1 file changed, 262 insertions(+), 52 deletions(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 50cfa66..79b8ed2 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -108,6 +108,32 @@ class MigrateTextBlocksCommand extends DrushCommands {
     }
   }
 
+  /**
+   * Remove all paragraphs from a node's sidebar items field.
+   */
+  protected function removeSidebarItems($node) {
+    if (!$node->hasField('field_sidebar_items')) {
+      return;
+    }
+
+    $paragraphs = $node->get('field_sidebar_items')->referencedEntities();
+    $removed_count = 0;
+
+    foreach ($paragraphs as $paragraph) {
+      $paragraph->delete();
+      $removed_count++;
+    }
+
+    if ($removed_count > 0) {
+      $node->set('field_sidebar_items', []);
+      $node->save();
+      $this->logger()->notice(dt('Removed @count sidebar paragraph(s) from node @nid', [
+        '@count' => $removed_count,
+        '@nid' => $node->id(),
+      ]));
+    }
+  }
+
   /**
    * Helper to find the new D10 Media entity that corresponds to a given old D7 file fid.
    *
@@ -169,6 +195,68 @@ class MigrateTextBlocksCommand extends DrushCommands {
     return $requested_view_mode;
   }
 
+  /**
+   * Process text block content to convert media tokens and clean up.
+   *
+   * @param string $content
+   *   The raw content from D7.
+   * @param string $format
+   *   The text format to use.
+   *
+   * @return array
+   *   An array with 'value' and 'format' keys for the processed content.
+   */
+  protected function processTextBlockContent($content, $format) {
+    // Clean up content slightly.
+    $content = $this->cleanContent($content);
+
+    // Remove literal occurrences of 'text_block'.
+    $content = str_replace('text_block', '', $content);
+
+    // Determine the text format to use in D10: if not found, fall back to 'filtered_html'.
+    $paragraph_format = !empty($format) ? $format : 'filtered_html';
+
+    // Convert D7 media tokens to <drupal-media>.
+    $content = preg_replace_callback(
+      '/\[\[\s*(\{.*?"fid":.*?\})\s*\]\]/s',
+      function ($matches) {
+        $json_string = $matches[1];
+        $embed_data = json_decode($json_string, TRUE);
+        if (is_array($embed_data) && isset($embed_data['fid'])) {
+          // Get the correct new media entity.
+          $old_fid = $embed_data['fid'];
+          $media = $this->getMediaEntityFromOldFid($old_fid);
+          if ($media) {
+            // If "fields.format" is set in the old JSON, treat that as the requested view mode.
+            // Otherwise default to "media_library".
+            $requested_mode = !empty($embed_data['fields']['format'])
+              ? $embed_data['fields']['format']
+              : 'media_library';
+
+            // Validate if this view mode actually exists in D10; fallback if not.
+            $view_mode = $this->getValidMediaViewMode($requested_mode);
+
+            $uuid = $media->uuid();
+            return '<drupal-media data-entity-type="media" data-entity-uuid="' . $uuid . '" data-view-mode="' . $view_mode . '"></drupal-media>';
+          }
+          else {
+            // No mapping => log warning & keep original token.
+            $this->logger()->warning(sprintf('No media mapping found for old fid "%s".', $old_fid));
+            return $matches[0];
+          }
+        }
+        // If JSON parse fails or no "fid", leave token as-is.
+        return $matches[0];
+      },
+      $content
+    );
+
+    return [
+      'value' => $content,
+      'format' => $paragraph_format,
+    ];
+  }
+
   /**
    * Migrate text blocks to text paragraphs.
    *
@@ -176,6 +264,26 @@ class MigrateTextBlocksCommand extends DrushCommands {
    * @aliases cm-text-blocks
    */
   public function migrateTextBlocks() {
+    try {
+      // First, migrate main content text blocks
+      $this->migrateMainContentTextBlocks();
+      
+      // Then, migrate sidebar text blocks
+      $this->migrateSidebarTextBlocks();
+      
+      $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.'));
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('Migration failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+    }
+  }
+
+  /**
+   * Migrate text blocks to text paragraphs in the main content field.
+   */
+  protected function migrateMainContentTextBlocks() {
     try {
       // Query the D7 database for text blocks.
       $query = $this->sourceDb->select('paragraphs_item', 'p')
@@ -210,7 +318,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
 
       $text_blocks = $query->execute()->fetchAll();
       if (empty($text_blocks)) {
-        $this->logger()->warning(dt('No text blocks found to migrate.'));
+        $this->logger()->warning(dt('No main content text blocks found to migrate.'));
         return;
       }
 
@@ -236,50 +344,10 @@ class MigrateTextBlocksCommand extends DrushCommands {
             $content .= $block->field_paragraph_text_content_value;
           }
 
-          // Clean up content slightly.
-          $content = $this->cleanContent($content);
-
-          // Remove literal occurrences of 'text_block'.
-          $content = str_replace('text_block', '', $content);
-
-          // Determine the text format to use in D10: if not found, fall back to 'filtered_html'.
-          $paragraph_format = !empty($block->field_paragraph_text_content_format)
-            ? $block->field_paragraph_text_content_format
-            : 'filtered_html';
-
-          // Convert D7 media tokens to <drupal-media>.
-          $content = preg_replace_callback(
-            '/\[\[\s*(\{.*?"fid":.*?\})\s*\]\]/s',
-            function ($matches) {
-              $json_string = $matches[1];
-              $embed_data = json_decode($json_string, TRUE);
-              if (is_array($embed_data) && isset($embed_data['fid'])) {
-                // Get the correct new media entity.
-                $old_fid = $embed_data['fid'];
-                $media = $this->getMediaEntityFromOldFid($old_fid);
-                if ($media) {
-                  // If "fields.format" is set in the old JSON, treat that as the requested view mode.
-                  // Otherwise default to "media_library".
-                  $requested_mode = !empty($embed_data['fields']['format'])
-                    ? $embed_data['fields']['format']
-                    : 'media_library';
-
-                  // Validate if this view mode actually exists in D10; fallback if not.
-                  $view_mode = $this->getValidMediaViewMode($requested_mode);
-
-                  $uuid = $media->uuid();
-                  return '<drupal-media data-entity-type="media" data-entity-uuid="' . $uuid . '" data-view-mode="' . $view_mode . '"></drupal-media>';
-                }
-                else {
-                  // No mapping => log warning & keep original token.
-                  $this->logger()->warning(sprintf('No media mapping found for old fid "%s".', $old_fid));
-                  return $matches[0];
-                }
-              }
-              // If JSON parse fails or no "fid", leave token as-is.
-              return $matches[0];
-            },
-            $content
+          // Process the content
+          $processed_content = $this->processTextBlockContent(
+            $content,
+            $block->field_paragraph_text_content_format
           );
 
           // Load the corresponding D10 node.
@@ -294,42 +362,184 @@ class MigrateTextBlocksCommand extends DrushCommands {
           // Create a new paragraph of type 'text'.
           $paragraph = Paragraph::create([
             'type' => 'text',
+            'langcode' => 'en', // Explicitly set language to English
             'field_text' => [
-              'value' => $content,
-              'format' => $paragraph_format,
+              'value' => $processed_content['value'],
+              'format' => $processed_content['format'],
             ],
           ]);
           $paragraph->save();
 
-          // Attach the new paragraph to the node’s field_paragraph.
+          // Attach the new paragraph to the node's field_paragraph.
           $node->field_paragraph[] = [
             'target_id' => $paragraph->id(),
             'target_revision_id' => $paragraph->getRevisionId(),
           ];
           $node->save();
+          
+          // Debug: Log the structure of the main content field after saving
+          $this->logger()->debug(dt('Main content field structure after save for node @nid: @count paragraphs', [
+            '@nid' => $node->id(),
+            '@count' => count($node->field_paragraph),
+          ]));
 
           $success_count++;
-          $this->logger()->notice(dt('Successfully migrated text block @id to node @nid', [
+          $this->logger()->notice(dt('Successfully migrated main content text block @id to node @nid', [
             '@id' => $block->item_id,
             '@nid' => $block->entity_id,
           ]));
         }
         catch (\Exception $e) {
           $error_count++;
-          $this->logger()->error(dt('Error migrating text block @id: @message', [
+          $this->logger()->error(dt('Error migrating main content text block @id: @message', [
             '@id' => $block->item_id,
             '@message' => $e->getMessage(),
           ]));
         }
       }
 
-      $this->logger()->notice(dt('Migration complete. Successes: @success, Errors: @errors', [
+      $this->logger()->notice(dt('Main content text block migration complete. Successes: @success, Errors: @errors', [
         '@success' => $success_count,
         '@errors' => $error_count,
       ]));
     }
     catch (\Exception $e) {
-      $this->logger()->error(dt('Migration failed: @message', [
+      $this->logger()->error(dt('Main content text block migration failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+    }
+  }
+
+  /**
+   * Migrate text blocks to text paragraphs in the sidebar items field.
+   */
+  protected function migrateSidebarTextBlocks() {
+    try {
+      // Query the D7 database for sidebar text blocks.
+      $query = $this->sourceDb->select('paragraphs_item', 'p')
+        ->fields('p', ['item_id', 'bundle', 'field_name'])
+        ->condition('p.bundle', 'text_block');
+
+      // Join with field_data_field_paragraph_heading for headings.
+      $query->leftJoin(
+        'field_data_field_paragraph_heading',
+        'h',
+        'p.item_id = h.entity_id AND h.entity_type = :entity_type',
+        [':entity_type' => 'paragraphs_item']
+      );
+      $query->fields('h', ['field_paragraph_heading_value']);
+
+      // Join with field_data_field_paragraph_text_content for body + format.
+      $query->leftJoin(
+        'field_data_field_paragraph_text_content',
+        't',
+        'p.item_id = t.entity_id AND t.entity_type = :entity_type',
+        [':entity_type' => 'paragraphs_item']
+      );
+      $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']);
+
+      // Join with node reference field to get parent node ID, specifically for sidebar items.
+      $query->leftJoin(
+        'field_data_field_sidebar_items',
+        'n',
+        'p.item_id = n.field_sidebar_items_value'
+      );
+      $query->fields('n', ['entity_id']);
+
+      $text_blocks = $query->execute()->fetchAll();
+      if (empty($text_blocks)) {
+        $this->logger()->warning(dt('No sidebar text blocks found to migrate.'));
+        return;
+      }
+
+      $success_count = 0;
+      $error_count = 0;
+      $processed_nodes = [];
+
+      foreach ($text_blocks as $block) {
+        try {
+          // Skip if no node reference.
+          if (empty($block->entity_id)) {
+            $this->logger()->warning(dt('Skipping sidebar text block @id - no node reference found', [
+              '@id' => $block->item_id,
+            ]));
+            continue;
+          }
+
+          // Load the corresponding D10 node.
+          $node = $this->entityTypeManager->getStorage('node')->load($block->entity_id);
+          if (!$node) {
+            throw new \Exception("Node {$block->entity_id} not found in D10.");
+          }
+
+          // Remove existing sidebar items before adding new ones, but only once per node
+          if (!in_array($block->entity_id, $processed_nodes)) {
+            $this->removeSidebarItems($node);
+            $processed_nodes[] = $block->entity_id;
+          }
+
+          // Combine heading + body text.
+          $content = '';
+          if (!empty($block->field_paragraph_heading_value)) {
+            $content .= '<h2>' . $block->field_paragraph_heading_value . '</h2>';
+          }
+          if (!empty($block->field_paragraph_text_content_value)) {
+            $content .= $block->field_paragraph_text_content_value;
+          }
+
+          // Process the content
+          $processed_content = $this->processTextBlockContent(
+            $content,
+            $block->field_paragraph_text_content_format
+          );
+
+          // Create a new paragraph of type 'text'.
+          $paragraph = Paragraph::create([
+            'type' => 'text',
+            'langcode' => 'en', // Explicitly set language to English
+            'field_text' => [
+              'value' => $processed_content['value'],
+              'format' => $processed_content['format'],
+            ],
+          ]);
+          $paragraph->save();
+
+          // Attach the new paragraph to the node's field_sidebar_items.
+          // Use the field directly to ensure proper structure
+          $node->field_sidebar_items[] = [
+            'target_id' => $paragraph->id(),
+            'target_revision_id' => $paragraph->getRevisionId(),
+          ];
+          $node->save();
+          
+          // Debug: Log the structure of the sidebar field after saving
+          $this->logger()->debug(dt('Sidebar field structure after save for node @nid: @count paragraphs', [
+            '@nid' => $node->id(),
+            '@count' => count($node->field_sidebar_items),
+          ]));
+
+          $success_count++;
+          $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [
+            '@id' => $block->item_id,
+            '@nid' => $block->entity_id,
+          ]));
+        }
+        catch (\Exception $e) {
+          $error_count++;
+          $this->logger()->error(dt('Error migrating sidebar text block @id: @message', [
+            '@id' => $block->item_id,
+            '@message' => $e->getMessage(),
+          ]));
+        }
+      }
+
+      $this->logger()->notice(dt('Sidebar text block migration complete. Successes: @success, Errors: @errors', [
+        '@success' => $success_count,
+        '@errors' => $error_count,
+      ]));
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('Sidebar text block migration failed: @message', [
         '@message' => $e->getMessage(),
       ]));
     }
-- 
GitLab


From 41fe5aad548d3bc68024c98015b2c4f21cd26bfc Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 19:37:49 +0000
Subject: [PATCH 02/12] Attempt to update sidebar mappings directly; ensure no
 sidebar items exist from the start

---
 .../Commands/MigrateTextBlocksCommand.php     | 136 +++++++++++++-----
 1 file changed, 97 insertions(+), 39 deletions(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 79b8ed2..364e62f 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -213,7 +213,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
     // Remove literal occurrences of 'text_block'.
     $content = str_replace('text_block', '', $content);
 
-    // Determine the text format to use in D10: if not found, fall back to 'filtered_html'.
+    // Determine the text format to use in D10: if not found, fall back to 'filtered_html'
     $paragraph_format = !empty($format) ? $format : 'filtered_html';
 
     // Convert D7 media tokens to <drupal-media>.
@@ -268,8 +268,13 @@ class MigrateTextBlocksCommand extends DrushCommands {
       // First, migrate main content text blocks
       $this->migrateMainContentTextBlocks();
       
+      // Rebuild caches between migrations
+      \Drupal::service('cache.render')->invalidateAll();
+      \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions();
+
       // Then, migrate sidebar text blocks
       $this->migrateSidebarTextBlocks();
+    
       
       $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.'));
     }
@@ -285,10 +290,18 @@ class MigrateTextBlocksCommand extends DrushCommands {
    */
   protected function migrateMainContentTextBlocks() {
     try {
-      // Query the D7 database for text blocks.
-      $query = $this->sourceDb->select('paragraphs_item', 'p')
-        ->fields('p', ['item_id', 'bundle', 'field_name'])
-        ->condition('p.bundle', 'text_block');
+      // Query the D7 database for text blocks in the main content field.
+      $query = $this->sourceDb->select('field_data_field_content_items', 'n')
+        ->fields('n', ['entity_id', 'field_content_items_value']);
+
+      // Join with paragraphs_item to get only text blocks
+      $query->join(
+        'paragraphs_item',
+        'p',
+        'p.item_id = n.field_content_items_value AND p.bundle = :bundle',
+        [':bundle' => 'text_block']
+      );
+      $query->fields('p', ['item_id', 'bundle', 'field_name']);
 
       // Join with field_data_field_paragraph_heading for headings.
       $query->leftJoin(
@@ -308,14 +321,6 @@ class MigrateTextBlocksCommand extends DrushCommands {
       );
       $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']);
 
-      // Join with node reference field to get parent node ID.
-      $query->leftJoin(
-        'field_data_field_content_items',
-        'n',
-        'p.item_id = n.field_content_items_value'
-      );
-      $query->fields('n', ['entity_id']);
-
       $text_blocks = $query->execute()->fetchAll();
       if (empty($text_blocks)) {
         $this->logger()->warning(dt('No main content text blocks found to migrate.'));
@@ -362,10 +367,10 @@ class MigrateTextBlocksCommand extends DrushCommands {
           // Create a new paragraph of type 'text'.
           $paragraph = Paragraph::create([
             'type' => 'text',
-            'langcode' => 'en', // Explicitly set language to English
+            'langcode' => 'und', // Set paragraph language to 'und' (undefined) to match existing data
             'field_text' => [
               'value' => $processed_content['value'],
-              'format' => $processed_content['format'],
+              'format' => $processed_content['format'], // Use the imported format for main content
             ],
           ]);
           $paragraph->save();
@@ -415,10 +420,23 @@ class MigrateTextBlocksCommand extends DrushCommands {
    */
   protected function migrateSidebarTextBlocks() {
     try {
-      // Query the D7 database for sidebar text blocks.
-      $query = $this->sourceDb->select('paragraphs_item', 'p')
-        ->fields('p', ['item_id', 'bundle', 'field_name'])
-        ->condition('p.bundle', 'text_block');
+      // Truncate the sidebar items tables to start fresh
+      $this->targetDb->truncate('node__field_sidebar_items')->execute();
+      $this->targetDb->truncate('node_revision__field_sidebar_items')->execute();
+      $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.'));
+      
+      // Query the D7 database for text blocks in the sidebar field.
+      $query = $this->sourceDb->select('field_data_field_sidebar_items', 'n')
+        ->fields('n', ['entity_id', 'field_sidebar_items_value']);
+
+      // Join with paragraphs_item to get only text blocks
+      $query->join(
+        'paragraphs_item',
+        'p',
+        'p.item_id = n.field_sidebar_items_value AND p.bundle = :bundle',
+        [':bundle' => 'text_block']
+      );
+      $query->fields('p', ['item_id', 'bundle', 'field_name']);
 
       // Join with field_data_field_paragraph_heading for headings.
       $query->leftJoin(
@@ -438,14 +456,6 @@ class MigrateTextBlocksCommand extends DrushCommands {
       );
       $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']);
 
-      // Join with node reference field to get parent node ID, specifically for sidebar items.
-      $query->leftJoin(
-        'field_data_field_sidebar_items',
-        'n',
-        'p.item_id = n.field_sidebar_items_value'
-      );
-      $query->fields('n', ['entity_id']);
-
       $text_blocks = $query->execute()->fetchAll();
       if (empty($text_blocks)) {
         $this->logger()->warning(dt('No sidebar text blocks found to migrate.'));
@@ -496,27 +506,74 @@ class MigrateTextBlocksCommand extends DrushCommands {
           // Create a new paragraph of type 'text'.
           $paragraph = Paragraph::create([
             'type' => 'text',
-            'langcode' => 'en', // Explicitly set language to English
+            'langcode' => 'und', // Set paragraph language to 'und' (undefined) to match existing data
             'field_text' => [
               'value' => $processed_content['value'],
-              'format' => $processed_content['format'],
+              'format' => 'basic_html', // Explicitly set to basic_html as required
             ],
           ]);
           $paragraph->save();
 
-          // Attach the new paragraph to the node's field_sidebar_items.
-          // Use the field directly to ensure proper structure
-          $node->field_sidebar_items[] = [
-            'target_id' => $paragraph->id(),
-            'target_revision_id' => $paragraph->getRevisionId(),
-          ];
-          $node->save();
+          // Instead of using the entity API, directly insert into the database
+          // Insert into node__field_sidebar_items table
+          $this->targetDb->insert('node__field_sidebar_items')
+            ->fields([
+              'bundle' => 'page',
+              'deleted' => 0,
+              'entity_id' => $node->id(),
+              'revision_id' => $node->getRevisionId(),
+              'langcode' => 'und',
+              'delta' => 0,
+              'field_sidebar_items_target_id' => $paragraph->id(),
+              'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(),
+            ])
+            ->execute();
+          
+          // Also insert into node_revision__field_sidebar_items table
+          $this->targetDb->insert('node_revision__field_sidebar_items')
+            ->fields([
+              'bundle' => 'page',
+              'deleted' => 0,
+              'entity_id' => $node->id(),
+              'revision_id' => $node->getRevisionId(),
+              'langcode' => 'und',
+              'delta' => 0,
+              'field_sidebar_items_target_id' => $paragraph->id(),
+              'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(),
+            ])
+            ->execute();
+
+
+          // Debug: Log detailed information about the paragraph and the node reference
+          $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [
+            '@nid' => $node->id(),
+            '@pid' => $paragraph->id(),
+            '@revid' => $paragraph->getRevisionId(),
+            '@type' => $paragraph->getType(),
+            '@lang' => $paragraph->language()->getId(),
+            '@format' => $paragraph->get('field_text')->format,
+          ]));
           
-          // Debug: Log the structure of the sidebar field after saving
-          $this->logger()->debug(dt('Sidebar field structure after save for node @nid: @count paragraphs', [
+          // Debug: Log the node's field_sidebar_items values and revision ID after saving
+          $this->logger()->notice(dt('IMPORTANT: Node @nid (revision: @vid) field_sidebar_items values: @values', [
             '@nid' => $node->id(),
-            '@count' => count($node->field_sidebar_items),
+            '@vid' => $node->getRevisionId(),
+            '@values' => json_encode($node->get('field_sidebar_items')->getValue()),
           ]));
+          
+          // Check if the node type has a view display configuration for the sidebar field
+          $entity_display_repository = \Drupal::service('entity_display.repository');
+          $view_display = $entity_display_repository->getViewDisplay('node', $node->bundle(), 'default');
+          if ($view_display && $view_display->getComponent('field_sidebar_items')) {
+            $this->logger()->notice(dt('Node @nid has view display configuration for field_sidebar_items', [
+              '@nid' => $node->id(),
+            ]));
+          } else {
+            $this->logger()->notice(dt('Node @nid (@type) does not have view display configuration for field_sidebar_items', [
+              '@nid' => $node->id(),
+              '@type' => $node->bundle(),
+            ]));
+          }
 
           $success_count++;
           $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [
@@ -545,6 +602,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
     }
   }
 
+
   /**
    * {@inheritdoc}
    */
-- 
GitLab


From 8912ac9e8bdf62d89abae30bda7a7393839e0533 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 20:25:10 +0000
Subject: [PATCH 03/12] Split migrateSidebarTextBlocks() into 2 parts

---
 .../Commands/MigrateTextBlocksCommand.php     | 167 +++++++++++-------
 1 file changed, 105 insertions(+), 62 deletions(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 364e62f..9fa68d1 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -272,9 +272,15 @@ class MigrateTextBlocksCommand extends DrushCommands {
       \Drupal::service('cache.render')->invalidateAll();
       \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions();
 
-      // Then, migrate sidebar text blocks
-      $this->migrateSidebarTextBlocks();
-    
+      // Then, create sidebar paragraph items
+      $sidebar_items = $this->createSidebarParagraphItems();
+      
+      // Rebuild caches again
+      \Drupal::service('cache.render')->invalidateAll();
+      \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions();
+      
+      // Finally, insert sidebar items into the database
+      $this->insertSidebarItemsIntoDatabase($sidebar_items);
       
       $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.'));
     }
@@ -416,14 +422,14 @@ class MigrateTextBlocksCommand extends DrushCommands {
   }
 
   /**
-   * Migrate text blocks to text paragraphs in the sidebar items field.
+   * Create sidebar paragraph items and return an array of data for database insertion.
+   * 
+   * @return array
+   *   An array of sidebar item data for database insertion.
    */
-  protected function migrateSidebarTextBlocks() {
+  protected function createSidebarParagraphItems() {
     try {
-      // Truncate the sidebar items tables to start fresh
-      $this->targetDb->truncate('node__field_sidebar_items')->execute();
-      $this->targetDb->truncate('node_revision__field_sidebar_items')->execute();
-      $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.'));
+      $sidebar_items = [];
       
       // Query the D7 database for text blocks in the sidebar field.
       $query = $this->sourceDb->select('field_data_field_sidebar_items', 'n')
@@ -459,7 +465,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
       $text_blocks = $query->execute()->fetchAll();
       if (empty($text_blocks)) {
         $this->logger()->warning(dt('No sidebar text blocks found to migrate.'));
-        return;
+        return $sidebar_items;
       }
 
       $success_count = 0;
@@ -514,95 +520,132 @@ class MigrateTextBlocksCommand extends DrushCommands {
           ]);
           $paragraph->save();
 
-          // Instead of using the entity API, directly insert into the database
+          // Store the paragraph and node information for later database insertion
+          $sidebar_items[] = [
+            'node_id' => $node->id(),
+            'node_revision_id' => $node->getRevisionId(),
+            'node_bundle' => $node->bundle(),
+            'paragraph_id' => $paragraph->id(),
+            'paragraph_revision_id' => $paragraph->getRevisionId(),
+          ];
+
+          // Debug: Log detailed information about the paragraph and the node reference
+          $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [
+            '@nid' => $node->id(),
+            '@pid' => $paragraph->id(),
+            '@revid' => $paragraph->getRevisionId(),
+            '@type' => $paragraph->getType(),
+            '@lang' => $paragraph->language()->getId(),
+            '@format' => $paragraph->get('field_text')->format,
+          ]));
+
+          $success_count++;
+          $this->logger()->notice(dt('Successfully created sidebar paragraph for text block @id to node @nid', [
+            '@id' => $block->item_id,
+            '@nid' => $block->entity_id,
+          ]));
+        }
+        catch (\Exception $e) {
+          $error_count++;
+          $this->logger()->error(dt('Error creating sidebar paragraph for text block @id: @message', [
+            '@id' => $block->item_id,
+            '@message' => $e->getMessage(),
+          ]));
+        }
+      }
+
+      $this->logger()->notice(dt('Sidebar paragraph creation complete. Successes: @success, Errors: @errors', [
+        '@success' => $success_count,
+        '@errors' => $error_count,
+      ]));
+      
+      return $sidebar_items;
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('Sidebar paragraph creation failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+      return [];
+    }
+  }
+
+  /**
+   * Insert sidebar items into the database.
+   * 
+   * @param array $sidebar_items
+   *   An array of sidebar item data for database insertion.
+   */
+  protected function insertSidebarItemsIntoDatabase(array $sidebar_items) {
+    try {
+      // Truncate the sidebar items tables to start fresh
+      $this->targetDb->truncate('node__field_sidebar_items')->execute();
+      $this->targetDb->truncate('node_revision__field_sidebar_items')->execute();
+      $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.'));
+      
+      if (empty($sidebar_items)) {
+        $this->logger()->warning(dt('No sidebar items to insert into the database.'));
+        return;
+      }
+      
+      $success_count = 0;
+      $error_count = 0;
+      
+      foreach ($sidebar_items as $item) {
+        try {
           // Insert into node__field_sidebar_items table
           $this->targetDb->insert('node__field_sidebar_items')
             ->fields([
-              'bundle' => 'page',
+              'bundle' => $item['node_bundle'],
               'deleted' => 0,
-              'entity_id' => $node->id(),
-              'revision_id' => $node->getRevisionId(),
+              'entity_id' => $item['node_id'],
+              'revision_id' => $item['node_revision_id'],
               'langcode' => 'und',
               'delta' => 0,
-              'field_sidebar_items_target_id' => $paragraph->id(),
-              'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(),
+              'field_sidebar_items_target_id' => $item['paragraph_id'],
+              'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'],
             ])
             ->execute();
           
           // Also insert into node_revision__field_sidebar_items table
           $this->targetDb->insert('node_revision__field_sidebar_items')
             ->fields([
-              'bundle' => 'page',
+              'bundle' => $item['node_bundle'],
               'deleted' => 0,
-              'entity_id' => $node->id(),
-              'revision_id' => $node->getRevisionId(),
+              'entity_id' => $item['node_id'],
+              'revision_id' => $item['node_revision_id'],
               'langcode' => 'und',
               'delta' => 0,
-              'field_sidebar_items_target_id' => $paragraph->id(),
-              'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(),
+              'field_sidebar_items_target_id' => $item['paragraph_id'],
+              'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'],
             ])
             ->execute();
-
-
-          // Debug: Log detailed information about the paragraph and the node reference
-          $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [
-            '@nid' => $node->id(),
-            '@pid' => $paragraph->id(),
-            '@revid' => $paragraph->getRevisionId(),
-            '@type' => $paragraph->getType(),
-            '@lang' => $paragraph->language()->getId(),
-            '@format' => $paragraph->get('field_text')->format,
-          ]));
-          
-          // Debug: Log the node's field_sidebar_items values and revision ID after saving
-          $this->logger()->notice(dt('IMPORTANT: Node @nid (revision: @vid) field_sidebar_items values: @values', [
-            '@nid' => $node->id(),
-            '@vid' => $node->getRevisionId(),
-            '@values' => json_encode($node->get('field_sidebar_items')->getValue()),
-          ]));
           
-          // Check if the node type has a view display configuration for the sidebar field
-          $entity_display_repository = \Drupal::service('entity_display.repository');
-          $view_display = $entity_display_repository->getViewDisplay('node', $node->bundle(), 'default');
-          if ($view_display && $view_display->getComponent('field_sidebar_items')) {
-            $this->logger()->notice(dt('Node @nid has view display configuration for field_sidebar_items', [
-              '@nid' => $node->id(),
-            ]));
-          } else {
-            $this->logger()->notice(dt('Node @nid (@type) does not have view display configuration for field_sidebar_items', [
-              '@nid' => $node->id(),
-              '@type' => $node->bundle(),
-            ]));
-          }
-
           $success_count++;
-          $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [
-            '@id' => $block->item_id,
-            '@nid' => $block->entity_id,
+          $this->logger()->notice(dt('Successfully inserted sidebar item for node @nid', [
+            '@nid' => $item['node_id'],
           ]));
         }
         catch (\Exception $e) {
           $error_count++;
-          $this->logger()->error(dt('Error migrating sidebar text block @id: @message', [
-            '@id' => $block->item_id,
+          $this->logger()->error(dt('Error inserting sidebar item for node @nid: @message', [
+            '@nid' => $item['node_id'],
             '@message' => $e->getMessage(),
           ]));
         }
       }
-
-      $this->logger()->notice(dt('Sidebar text block migration complete. Successes: @success, Errors: @errors', [
+      
+      $this->logger()->notice(dt('Sidebar item database insertion complete. Successes: @success, Errors: @errors', [
         '@success' => $success_count,
         '@errors' => $error_count,
       ]));
     }
     catch (\Exception $e) {
-      $this->logger()->error(dt('Sidebar text block migration failed: @message', [
+      $this->logger()->error(dt('Sidebar item database insertion failed: @message', [
         '@message' => $e->getMessage(),
       ]));
     }
   }
 
-
   /**
    * {@inheritdoc}
    */
-- 
GitLab


From 0ac836bede59f5978886263f56b368d21a20fef6 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 21:18:15 +0000
Subject: [PATCH 04/12] Rebuild caches as a final action;

---
 src/Drush/Commands/MigrateTextBlocksCommand.php | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 9fa68d1..5d6f0f0 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -269,19 +269,17 @@ class MigrateTextBlocksCommand extends DrushCommands {
       $this->migrateMainContentTextBlocks();
       
       // Rebuild caches between migrations
-      \Drupal::service('cache.render')->invalidateAll();
-      \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions();
+      drupal_flush_all_caches();
 
       // Then, create sidebar paragraph items
       $sidebar_items = $this->createSidebarParagraphItems();
       
-      // Rebuild caches again
-      \Drupal::service('cache.render')->invalidateAll();
-      \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions();
-      
       // Finally, insert sidebar items into the database
       $this->insertSidebarItemsIntoDatabase($sidebar_items);
       
+      // Rebuild caches again
+      drupal_flush_all_caches();
+
       $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.'));
     }
     catch (\Exception $e) {
-- 
GitLab


From 24bc17e6ae7f053c520872328af5e91f90c58068 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 22:12:21 +0000
Subject: [PATCH 05/12] Avoid db inserts, use $node->save() to attach
 paragraphs to nodes;

---
 .../Commands/MigrateTextBlocksCommand.php     | 35 ++++---------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 5d6f0f0..730889e 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -590,34 +590,13 @@ class MigrateTextBlocksCommand extends DrushCommands {
       
       foreach ($sidebar_items as $item) {
         try {
-          // Insert into node__field_sidebar_items table
-          $this->targetDb->insert('node__field_sidebar_items')
-            ->fields([
-              'bundle' => $item['node_bundle'],
-              'deleted' => 0,
-              'entity_id' => $item['node_id'],
-              'revision_id' => $item['node_revision_id'],
-              'langcode' => 'und',
-              'delta' => 0,
-              'field_sidebar_items_target_id' => $item['paragraph_id'],
-              'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'],
-            ])
-            ->execute();
-          
-          // Also insert into node_revision__field_sidebar_items table
-          $this->targetDb->insert('node_revision__field_sidebar_items')
-            ->fields([
-              'bundle' => $item['node_bundle'],
-              'deleted' => 0,
-              'entity_id' => $item['node_id'],
-              'revision_id' => $item['node_revision_id'],
-              'langcode' => 'und',
-              'delta' => 0,
-              'field_sidebar_items_target_id' => $item['paragraph_id'],
-              'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'],
-            ])
-            ->execute();
-          
+          $node = $this->entityTypeManager->getStorage('node')->load($item['node_id']);
+          $node->field_sidebar_items[] = [
+            'target_id' => $item['paragraph_id'],
+            'target_revision_id' => $item['paragraph_revision_id'],
+          ];
+          $node->save();
+
           $success_count++;
           $this->logger()->notice(dt('Successfully inserted sidebar item for node @nid', [
             '@nid' => $item['node_id'],
-- 
GitLab


From 091c2d55a32fe10207e6f3035efc9f33e0826276 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Tue, 18 Mar 2025 22:33:39 +0000
Subject: [PATCH 06/12] Minor: reword truncation log message

---
 src/Drush/Commands/MigrateTextBlocksCommand.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index 730889e..a3f82bf 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -578,7 +578,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
       // Truncate the sidebar items tables to start fresh
       $this->targetDb->truncate('node__field_sidebar_items')->execute();
       $this->targetDb->truncate('node_revision__field_sidebar_items')->execute();
-      $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.'));
+      $this->logger()->notice(dt('Truncated sidebar items tables to start afresh.'));
       
       if (empty($sidebar_items)) {
         $this->logger()->warning(dt('No sidebar items to insert into the database.'));
-- 
GitLab


From dc73033f816bfd5c40e22a4b9af97686f67015d6 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Wed, 19 Mar 2025 09:20:03 +0000
Subject: [PATCH 07/12] Minor: reword truncation code comment

---
 src/Drush/Commands/MigrateTextBlocksCommand.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php
index a3f82bf..dcf6a33 100644
--- a/src/Drush/Commands/MigrateTextBlocksCommand.php
+++ b/src/Drush/Commands/MigrateTextBlocksCommand.php
@@ -575,7 +575,7 @@ class MigrateTextBlocksCommand extends DrushCommands {
    */
   protected function insertSidebarItemsIntoDatabase(array $sidebar_items) {
     try {
-      // Truncate the sidebar items tables to start fresh
+      // Truncate the sidebar items tables to start afresh
       $this->targetDb->truncate('node__field_sidebar_items')->execute();
       $this->targetDb->truncate('node_revision__field_sidebar_items')->execute();
       $this->logger()->notice(dt('Truncated sidebar items tables to start afresh.'));
-- 
GitLab


From d57b4ff57b2e6369ce99836d1470f6c793b88d35 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Thu, 20 Mar 2025 09:22:16 +0000
Subject: [PATCH 08/12] Initial script for cleanup Drush service;

---
 cambridge_migrations.services.yml            |   6 +
 drush.services.yml                           |   6 +
 src/Drush/Commands/MigrateCleanupCommand.php | 206 +++++++++++++++++++
 3 files changed, 218 insertions(+)
 create mode 100644 src/Drush/Commands/MigrateCleanupCommand.php

diff --git a/cambridge_migrations.services.yml b/cambridge_migrations.services.yml
index cc08c86..fab37c3 100644
--- a/cambridge_migrations.services.yml
+++ b/cambridge_migrations.services.yml
@@ -10,3 +10,9 @@ services:
     arguments: ['@entity_type.manager', '@file_system', '@database']
     tags:
       - { name: drush.command }
+      
+  cambridge_migrations.cleanup_command:
+    class: Drupal\cambridge_migrations\Drush\Commands\MigrateCleanupCommand
+    arguments: ['@entity_type.manager']
+    tags:
+      - { name: drush.command }
diff --git a/drush.services.yml b/drush.services.yml
index cc08c86..fab37c3 100644
--- a/drush.services.yml
+++ b/drush.services.yml
@@ -10,3 +10,9 @@ services:
     arguments: ['@entity_type.manager', '@file_system', '@database']
     tags:
       - { name: drush.command }
+      
+  cambridge_migrations.cleanup_command:
+    class: Drupal\cambridge_migrations\Drush\Commands\MigrateCleanupCommand
+    arguments: ['@entity_type.manager']
+    tags:
+      - { name: drush.command }
diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php
new file mode 100644
index 0000000..e934c06
--- /dev/null
+++ b/src/Drush/Commands/MigrateCleanupCommand.php
@@ -0,0 +1,206 @@
+<?php
+
+namespace Drupal\cambridge_migrations\Drush\Commands;
+
+use Drush\Commands\DrushCommands;
+use Drupal\Core\Entity\EntityTypeManagerInterface;
+use Drupal\paragraphs\Entity\Paragraph;
+use Symfony\Component\DependencyInjection\ContainerInterface;
+use DOMDocument;
+use DOMElement;
+
+/**
+ * Drush command for cleaning up paragraph HTML content.
+ */
+class MigrateCleanupCommand extends DrushCommands {
+
+  /**
+   * The entity type manager.
+   *
+   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
+   */
+  protected $entityTypeManager;
+
+  /**
+   * Constructs a new MigrateCleanupCommand object.
+   */
+  public function __construct(EntityTypeManagerInterface $entity_type_manager) {
+    parent::__construct();
+    $this->entityTypeManager = $entity_type_manager;
+  }
+
+  /**
+   * Clean HTML content by removing unwanted attributes and empty tags.
+   *
+   * @param string $html
+   *   The HTML content to clean.
+   *
+   * @return string
+   *   The cleaned HTML content.
+   */
+  protected function cleanHtml($html) {
+    if (empty($html)) {
+      return $html;
+    }
+
+    // Create a new DOM document
+    $doc = new DOMDocument();
+    
+    // Preserve whitespace to maintain formatting
+    $doc->preserveWhiteSpace = true;
+    $doc->formatOutput = true;
+
+    // Load HTML with UTF-8 encoding and suppress warnings
+    $doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+
+    // Remove unwanted attributes and empty tags recursively
+    $this->cleanNode($doc->documentElement);
+
+    // Get the cleaned HTML
+    $cleanedHtml = $doc->saveHTML($doc->documentElement);
+
+    // Remove the XML declaration if present
+    $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml);
+
+    return trim($cleanedHtml);
+  }
+
+  /**
+   * Recursively clean a DOM node by removing unwanted attributes and empty tags.
+   *
+   * @param DOMElement $node
+   *   The DOM node to clean.
+   *
+   * @return bool
+   *   TRUE if the node should be kept, FALSE if it should be removed.
+   */
+  protected function cleanNode(DOMElement $node) {
+    // Remove style and script elements entirely
+    $tagName = strtolower($node->tagName);
+    if ($tagName === 'style' || $tagName === 'script') {
+      return false;
+    }
+
+    // Remove class and style attributes
+    $node->removeAttribute('class');
+    $node->removeAttribute('style');
+
+    // Process child nodes
+    $children = [];
+    foreach ($node->childNodes as $child) {
+      $children[] = $child;
+    }
+
+    $hasNonEmptyContent = false;
+
+    foreach ($children as $child) {
+      if ($child->nodeType === XML_ELEMENT_NODE) {
+        // Recursively clean child element
+        $keepChild = $this->cleanNode($child);
+        if (!$keepChild) {
+          $node->removeChild($child);
+        } else {
+          $hasNonEmptyContent = true;
+        }
+      }
+      elseif ($child->nodeType === XML_TEXT_NODE) {
+        // Check if text node contains more than just whitespace and non-breaking spaces
+        $text = trim($child->nodeValue);
+        $textWithoutNbsp = str_replace("\xC2\xA0", '', $text); // Remove non-breaking spaces
+        if (!empty($textWithoutNbsp)) {
+          $hasNonEmptyContent = true;
+        }
+      }
+    }
+
+    // Return true if node has meaningful content
+    return $hasNonEmptyContent;
+  }
+
+  /**
+   * Clean up HTML content in text paragraphs.
+   *
+   * @command cambridge:migrate-cleanup
+   * @aliases cm-cleanup
+   */
+  public function migrateCleanup() {
+    try {
+      // Query all paragraphs of type "text"
+      $storage = $this->entityTypeManager->getStorage('paragraph');
+      $query = $storage->getQuery()
+        ->condition('type', 'text')
+        ->accessCheck(FALSE);
+      $pids = $query->execute();
+
+      if (empty($pids)) {
+        $this->logger()->warning(dt('No text paragraphs found to clean up.'));
+        return;
+      }
+
+      $success_count = 0;
+      $error_count = 0;
+
+      foreach ($pids as $pid) {
+        try {
+          /** @var \Drupal\paragraphs\Entity\Paragraph $paragraph */
+          $paragraph = $storage->load($pid);
+          
+          if (!$paragraph->hasField('field_text')) {
+            $this->logger()->warning(dt('Paragraph @pid does not have field_text field.', [
+              '@pid' => $pid,
+            ]));
+            continue;
+          }
+
+          $text_field = $paragraph->get('field_text');
+          $value = $text_field->value;
+          $format = $text_field->format;
+
+          // Clean the HTML content
+          $cleaned_value = $this->cleanHtml($value);
+
+          // Update the paragraph if content changed
+          if ($cleaned_value !== $value) {
+            $text_field->setValue([
+              'value' => $cleaned_value,
+              'format' => $format,
+            ]);
+            $paragraph->save();
+
+            $success_count++;
+            $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [
+              '@pid' => $pid,
+            ]));
+          }
+        }
+        catch (\Exception $e) {
+          $error_count++;
+          $this->logger()->error(dt('Error cleaning paragraph @pid: @message', [
+            '@pid' => $pid,
+            '@message' => $e->getMessage(),
+          ]));
+        }
+      }
+
+      $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Errors: @errors', [
+        '@success' => $success_count,
+        '@errors' => $error_count,
+      ]));
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('HTML cleanup failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+    }
+  }
+
+  /**
+   * {@inheritdoc}
+   */
+  public static function create(ContainerInterface $container) {
+    return new static(
+      $container->get('entity_type_manager')
+    );
+  }
+
+}
-- 
GitLab


From 95a0c966593c28e2f075eacf0ea7fe21d49c7e8f Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Thu, 20 Mar 2025 09:36:39 +0000
Subject: [PATCH 09/12] Update - removal process too thorough, removes
 legitimate content;

---
 src/Drush/Commands/MigrateCleanupCommand.php | 70 ++++++++++++++------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php
index e934c06..d37d3b7 100644
--- a/src/Drush/Commands/MigrateCleanupCommand.php
+++ b/src/Drush/Commands/MigrateCleanupCommand.php
@@ -50,19 +50,37 @@ class MigrateCleanupCommand extends DrushCommands {
     $doc->preserveWhiteSpace = true;
     $doc->formatOutput = true;
 
-    // Load HTML with UTF-8 encoding and suppress warnings
+    // Suppress warnings during HTML loading
+    $internalErrors = libxml_use_internal_errors(true);
+    
+    // Load HTML with UTF-8 encoding
     $doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+    
+    // Restore error handling
+    libxml_use_internal_errors($internalErrors);
 
-    // Remove unwanted attributes and empty tags recursively
-    $this->cleanNode($doc->documentElement);
+    // Process the document if it loaded successfully
+    if ($doc->documentElement) {
+      // Remove unwanted attributes and empty tags recursively
+      $this->cleanNode($doc->documentElement);
 
-    // Get the cleaned HTML
-    $cleanedHtml = $doc->saveHTML($doc->documentElement);
+      // Get the cleaned HTML
+      $cleanedHtml = $doc->saveHTML($doc->documentElement);
 
-    // Remove the XML declaration if present
-    $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml);
+      // Remove the XML declaration if present
+      $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml);
+
+      // If the result is empty but the original wasn't, return the original
+      if (empty(trim($cleanedHtml)) && !empty(trim($html))) {
+        $this->logger()->warning('Cleaning resulted in empty content, keeping original');
+        return $html;
+      }
 
-    return trim($cleanedHtml);
+      return trim($cleanedHtml);
+    }
+    
+    // If document couldn't be properly parsed, return original HTML
+    return $html;
   }
 
   /**
@@ -97,24 +115,23 @@ class MigrateCleanupCommand extends DrushCommands {
       if ($child->nodeType === XML_ELEMENT_NODE) {
         // Recursively clean child element
         $keepChild = $this->cleanNode($child);
-        if (!$keepChild) {
+        if (!$keepChild && ($child->nodeName === 'style' || $child->nodeName === 'script')) {
+          // Only remove style and script elements, keep other elements even if empty
           $node->removeChild($child);
-        } else {
+        } 
+        else {
+          // Always keep other elements, even if they're empty
           $hasNonEmptyContent = true;
         }
       }
       elseif ($child->nodeType === XML_TEXT_NODE) {
-        // Check if text node contains more than just whitespace and non-breaking spaces
-        $text = trim($child->nodeValue);
-        $textWithoutNbsp = str_replace("\xC2\xA0", '', $text); // Remove non-breaking spaces
-        if (!empty($textWithoutNbsp)) {
-          $hasNonEmptyContent = true;
-        }
+        // Always keep text nodes, even if they're just whitespace
+        $hasNonEmptyContent = true;
       }
     }
 
-    // Return true if node has meaningful content
-    return $hasNonEmptyContent;
+    // Always keep the node unless it's a style or script element
+    return true;
   }
 
   /**
@@ -139,6 +156,7 @@ class MigrateCleanupCommand extends DrushCommands {
 
       $success_count = 0;
       $error_count = 0;
+      $unchanged_count = 0;
 
       foreach ($pids as $pid) {
         try {
@@ -156,6 +174,14 @@ class MigrateCleanupCommand extends DrushCommands {
           $value = $text_field->value;
           $format = $text_field->format;
 
+          // Skip if empty
+          if (empty($value)) {
+            $this->logger()->notice(dt('Skipping empty paragraph @pid', [
+              '@pid' => $pid,
+            ]));
+            continue;
+          }
+
           // Clean the HTML content
           $cleaned_value = $this->cleanHtml($value);
 
@@ -171,6 +197,11 @@ class MigrateCleanupCommand extends DrushCommands {
             $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [
               '@pid' => $pid,
             ]));
+          } else {
+            $unchanged_count++;
+            $this->logger()->info(dt('No changes needed for paragraph @pid', [
+              '@pid' => $pid,
+            ]));
           }
         }
         catch (\Exception $e) {
@@ -182,8 +213,9 @@ class MigrateCleanupCommand extends DrushCommands {
         }
       }
 
-      $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Errors: @errors', [
+      $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Unchanged: @unchanged, Errors: @errors', [
         '@success' => $success_count,
+        '@unchanged' => $unchanged_count,
         '@errors' => $error_count,
       ]));
     }
-- 
GitLab


From 8bab4800ad4eb4b4b848197772774e9a7f030497 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Thu, 20 Mar 2025 09:45:26 +0000
Subject: [PATCH 10/12] Redo cleanup process; include extra notices;

---
 src/Drush/Commands/MigrateCleanupCommand.php | 347 ++++++++++---------
 1 file changed, 192 insertions(+), 155 deletions(-)

diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php
index d37d3b7..db6a5a0 100644
--- a/src/Drush/Commands/MigrateCleanupCommand.php
+++ b/src/Drush/Commands/MigrateCleanupCommand.php
@@ -8,9 +8,20 @@ use Drupal\paragraphs\Entity\Paragraph;
 use Symfony\Component\DependencyInjection\ContainerInterface;
 use DOMDocument;
 use DOMElement;
+use DOMNode;
 
 /**
- * Drush command for cleaning up paragraph HTML content.
+ * Drush command for cleaning up HTML content in text paragraphs.
+ *
+ * This command:
+ * - Queries all paragraphs of type "text"
+ * - For each paragraph, processes its HTML content to:
+ *   - Remove all class and style attributes from every element
+ *   - Recursively remove empty tags or tags with only non-breaking spaces
+ *   - Remove all <style> and <script> elements entirely
+ *
+ * To run:
+ *   ddev drush cambridge:migrate-cleanup
  */
 class MigrateCleanupCommand extends DrushCommands {
 
@@ -23,207 +34,233 @@ class MigrateCleanupCommand extends DrushCommands {
 
   /**
    * Constructs a new MigrateCleanupCommand object.
+   *
+   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
+   *   The entity type manager.
    */
-  public function __construct(EntityTypeManagerInterface $entity_type_manager) {
+  public function __construct(
+    EntityTypeManagerInterface $entity_type_manager
+  ) {
     parent::__construct();
     $this->entityTypeManager = $entity_type_manager;
   }
 
   /**
-   * Clean HTML content by removing unwanted attributes and empty tags.
+   * Clean up HTML content in text paragraphs.
+   *
+   * @command cambridge:migrate-cleanup
+   * @aliases cm-cleanup
+   */
+  public function migrateCleanup() {
+    try {
+      // Query all paragraphs of type "text".
+      $paragraph_storage = $this->entityTypeManager->getStorage('paragraph');
+      $query = $paragraph_storage->getQuery()
+        ->condition('type', 'text')
+        ->accessCheck(FALSE);
+      $paragraph_ids = $query->execute();
+
+      if (empty($paragraph_ids)) {
+        $this->logger()->warning(dt('No text paragraphs found to clean up.'));
+        return;
+      }
+
+      $this->logger()->notice(dt('Found @count text paragraphs to process.', [
+        '@count' => count($paragraph_ids),
+      ]));
+
+      $success_count = 0;
+      $unchanged_count = 0;
+      $error_count = 0;
+
+      // Load paragraphs in chunks to avoid memory issues.
+      $chunk_size = 50;
+      foreach (array_chunk($paragraph_ids, $chunk_size, TRUE) as $chunk) {
+        $paragraphs = $paragraph_storage->loadMultiple($chunk);
+        foreach ($paragraphs as $paragraph_id => $paragraph) {
+          try {
+            // Ensure we're working with a Paragraph entity.
+            if (!($paragraph instanceof Paragraph)) {
+              $paragraph = Paragraph::load($paragraph_id);
+              if (!$paragraph) {
+                $this->logger()->warning(dt('Could not load paragraph @id', [
+                  '@id' => $paragraph_id,
+                ]));
+                continue;
+              }
+            }
+
+            // Skip if paragraph doesn't have a text field.
+            if (!$paragraph->hasField('field_text')) {
+              $this->logger()->warning(dt('Skipping paragraph @id - no text field found', [
+                '@id' => $paragraph->id(),
+              ]));
+              continue;
+            }
+
+            // Get the current HTML content.
+            $text_field = $paragraph->get('field_text');
+            $html = $text_field->value;
+            $format = $text_field->format;
+            if (empty($html)) {
+              $this->logger()->notice(dt('Skipping paragraph @id - empty content', [
+                '@id' => $paragraph->id(),
+              ]));
+              continue;
+            }
+
+            // Clean the HTML; skip the save entirely when nothing changed.
+            $cleaned_html = $this->cleanupHtml($html);
+            if ($cleaned_html === $html) {
+              $unchanged_count++;
+              continue;
+            }
+
+            $paragraph->set('field_text', [
+              'value' => $cleaned_html,
+              'format' => $format,
+            ]);
+            $paragraph->save();
+            $success_count++;
+            $this->logger()->notice(dt('Successfully cleaned up paragraph @id', [
+              '@id' => $paragraph->id(),
+            ]));
+          }
+          catch (\Exception $e) {
+            $error_count++;
+            $this->logger()->error(dt('Error cleaning up paragraph @id: @message', [
+              '@id' => $paragraph_id,
+              '@message' => $e->getMessage(),
+            ]));
+          }
+        }
+
+        // Clear static caches to avoid memory issues.
+        $paragraph_storage->resetCache($chunk);
+      }
+
+      $this->logger()->notice(dt('Cleanup complete. Successes: @success, Unchanged: @unchanged, Errors: @errors', [
+        '@success' => $success_count,
+        '@unchanged' => $unchanged_count,
+        '@errors' => $error_count,
+      ]));
+    }
+    catch (\Exception $e) {
+      $this->logger()->error(dt('Cleanup failed: @message', [
+        '@message' => $e->getMessage(),
+      ]));
+    }
+  }
+
+  /**
+   * Cleans an HTML fragment by parsing it and pruning it with processNode().
    *
    * @param string $html
-   *   The HTML content to clean.
+   *   The HTML fragment to clean up; an empty value is returned unchanged.
    *
    * @return string
-   *   The cleaned HTML content.
+   *   The serialised child nodes of the parsed <body> after processing.
    */
-  protected function cleanHtml($html) {
+  protected function cleanupHtml($html) {
+    // Nothing to parse: hand empty input straight back.
     if (empty($html)) {
       return $html;
     }
 
-    // Create a new DOM document
+    // Document that the fragment is parsed into.
     $doc = new DOMDocument();
     
-    // Preserve whitespace to maintain formatting
+    // Do not let the parser strip redundant whitespace.
     $doc->preserveWhiteSpace = true;
-    $doc->formatOutput = true;
-
-    // Suppress warnings during HTML loading
-    $internalErrors = libxml_use_internal_errors(true);
     
-    // Load HTML with UTF-8 encoding
-    $doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+    // Buffer libxml parse errors internally so unknown (e.g. HTML5) tags do not raise PHP warnings.
+    $previous_value = libxml_use_internal_errors(true);
     
-    // Restore error handling
-    libxml_use_internal_errors($internalErrors);
-
-    // Process the document if it loaded successfully
-    if ($doc->documentElement) {
-      // Remove unwanted attributes and empty tags recursively
-      $this->cleanNode($doc->documentElement);
+    // Wrap the fragment in a full document so its nodes end up under <body>;
+    // the charset meta tag declares the markup as UTF-8.
+    $doc->loadHTML('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+    
+    // Restore the previous libxml error-handling mode.
+    libxml_use_internal_errors($previous_value);
 
-      // Get the cleaned HTML
-      $cleanedHtml = $doc->saveHTML($doc->documentElement);
+    // Get the wrapper's <body> element, which holds the fragment's nodes.
+    $body = $doc->getElementsByTagName('body')->item(0);
 
-      // Remove the XML declaration if present
-      $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml);
+    // Clean the <body> subtree in place.
+    $this->processNode($body);
 
-      // If the result is empty but the original wasn't, return the original
-      if (empty(trim($cleanedHtml)) && !empty(trim($html))) {
-        $this->logger()->warning('Cleaning resulted in empty content, keeping original');
-        return $html;
-      }
-
-      return trim($cleanedHtml);
-    }
+    // Serialise only the children of <body> so the wrapper markup is not returned.
+    $processed_html = '';
+    $children = $body->childNodes;
     
-    // If document couldn't be properly parsed, return original HTML
-    return $html;
+    foreach ($children as $child) {
+      $processed_html .= $doc->saveHTML($child);
+    }
+
+    return $processed_html;
   }
 
   /**
-   * Recursively clean a DOM node by removing unwanted attributes and empty tags.
+   * Process a DOM node recursively.
    *
-   * @param DOMElement $node
-   *   The DOM node to clean.
+   * @param \DOMNode $node
+   *   The DOM node to process.
    *
    * @return bool
    *   TRUE if the node should be kept, FALSE if it should be removed.
    */
-  protected function cleanNode(DOMElement $node) {
-    // Remove style and script elements entirely
-    $tagName = strtolower($node->tagName);
-    if ($tagName === 'style' || $tagName === 'script') {
+  protected function processNode(DOMNode $node) {
+    // Non-element nodes are kept, except text nodes that turn out to be blank.
+    if ($node->nodeType !== XML_ELEMENT_NODE) {
+      if ($node->nodeType === XML_TEXT_NODE) {
+        // Strip nbsp (literal and UTF-8) before trimming so nbsp-space-nbsp runs are blank too.
+        // NOTE(review): blank text between inline siblings is reported as removable as well.
+        $text = trim(str_replace(['&nbsp;', "\xC2\xA0"], '', $node->textContent));
+        // Compare with '' explicitly: empty('0') is TRUE and would discard the text "0".
+        return $text !== '';
+      }
+      return true;
+    }
+
+    // If it's a script or style element, remove it.
+    if (in_array(strtolower($node->nodeName), ['script', 'style'])) {
+      $node->parentNode->removeChild($node);
       return false;
     }
 
-    // Remove class and style attributes
-    $node->removeAttribute('class');
-    $node->removeAttribute('style');
+    // If it's an element node, remove class and style attributes.
+    if ($node instanceof DOMElement) {
+      $node->removeAttribute('class');
+      $node->removeAttribute('style');
+    }
 
-    // Process child nodes
+    // Process child nodes recursively.
     $children = [];
     foreach ($node->childNodes as $child) {
       $children[] = $child;
     }
 
-    $hasNonEmptyContent = false;
-
+    $keep_node = false;
     foreach ($children as $child) {
-      if ($child->nodeType === XML_ELEMENT_NODE) {
-        // Recursively clean child element
-        $keepChild = $this->cleanNode($child);
-        if (!$keepChild && ($child->nodeName === 'style' || $child->nodeName === 'script')) {
-          // Only remove style and script elements, keep other elements even if empty
-          $node->removeChild($child);
-        } 
-        else {
-          // Always keep other elements, even if they're empty
-          $hasNonEmptyContent = true;
-        }
+      $keep_child = $this->processNode($child);
+      if (!$keep_child && $child->parentNode) {
+        $child->parentNode->removeChild($child);
       }
-      elseif ($child->nodeType === XML_TEXT_NODE) {
-        // Always keep text nodes, even if they're just whitespace
-        $hasNonEmptyContent = true;
+      else {
+        $keep_node = true;
       }
     }
 
-    // Always keep the node unless it's a style or script element
-    return true;
-  }
-
-  /**
-   * Clean up HTML content in text paragraphs.
-   *
-   * @command cambridge:migrate-cleanup
-   * @aliases cm-cleanup
-   */
-  public function migrateCleanup() {
-    try {
-      // Query all paragraphs of type "text"
-      $storage = $this->entityTypeManager->getStorage('paragraph');
-      $query = $storage->getQuery()
-        ->condition('type', 'text')
-        ->accessCheck(FALSE);
-      $pids = $query->execute();
-
-      if (empty($pids)) {
-        $this->logger()->warning(dt('No text paragraphs found to clean up.'));
-        return;
-      }
-
-      $success_count = 0;
-      $error_count = 0;
-      $unchanged_count = 0;
-
-      foreach ($pids as $pid) {
-        try {
-          /** @var \Drupal\paragraphs\Entity\Paragraph $paragraph */
-          $paragraph = $storage->load($pid);
-          
-          if (!$paragraph->hasField('field_text')) {
-            $this->logger()->warning(dt('Paragraph @pid does not have field_text field.', [
-              '@pid' => $pid,
-            ]));
-            continue;
-          }
-
-          $text_field = $paragraph->get('field_text');
-          $value = $text_field->value;
-          $format = $text_field->format;
-
-          // Skip if empty
-          if (empty($value)) {
-            $this->logger()->notice(dt('Skipping empty paragraph @pid', [
-              '@pid' => $pid,
-            ]));
-            continue;
-          }
-
-          // Clean the HTML content
-          $cleaned_value = $this->cleanHtml($value);
-
-          // Update the paragraph if content changed
-          if ($cleaned_value !== $value) {
-            $text_field->setValue([
-              'value' => $cleaned_value,
-              'format' => $format,
-            ]);
-            $paragraph->save();
-
-            $success_count++;
-            $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [
-              '@pid' => $pid,
-            ]));
-          } else {
-            $unchanged_count++;
-            $this->logger()->info(dt('No changes needed for paragraph @pid', [
-              '@pid' => $pid,
-            ]));
-          }
-        }
-        catch (\Exception $e) {
-          $error_count++;
-          $this->logger()->error(dt('Error cleaning paragraph @pid: @message', [
-            '@pid' => $pid,
-            '@message' => $e->getMessage(),
-          ]));
-        }
+    // If the node has no children and no text content, mark it for removal.
+    if (!$keep_node && $node->childNodes->length === 0) {
+      // Check if the node is not a self-closing tag like <img>, <br>, etc.
+      $self_closing_tags = ['img', 'br', 'hr', 'input', 'meta', 'link'];
+      if (!in_array(strtolower($node->nodeName), $self_closing_tags)) {
+        return false;
       }
-
-      $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Unchanged: @unchanged, Errors: @errors', [
-        '@success' => $success_count,
-        '@unchanged' => $unchanged_count,
-        '@errors' => $error_count,
-      ]));
-    }
-    catch (\Exception $e) {
-      $this->logger()->error(dt('HTML cleanup failed: @message', [
-        '@message' => $e->getMessage(),
-      ]));
     }
+
+    return true;
   }
 
   /**
@@ -231,7 +268,7 @@ class MigrateCleanupCommand extends DrushCommands {
    */
   public static function create(ContainerInterface $container) {
     return new static(
-      $container->get('entity_type_manager')
+      $container->get('entity_type.manager')
     );
   }
 
-- 
GitLab


From c82c87cf12aa673f2828d06e90aa8aa6d07e0e6e Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Fri, 21 Mar 2025 18:42:35 +0000
Subject: [PATCH 11/12] Exclude embedded Drupal media tags 'drupal-entity' and
 'drupal-media';

---
 src/Drush/Commands/MigrateCleanupCommand.php | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php
index db6a5a0..e054cd8 100644
--- a/src/Drush/Commands/MigrateCleanupCommand.php
+++ b/src/Drush/Commands/MigrateCleanupCommand.php
@@ -222,6 +222,12 @@ class MigrateCleanupCommand extends DrushCommands {
       return true;
     }
 
+    // Always preserve Drupal-specific tags
+    $drupal_tags = ['drupal-entity', 'drupal-media'];
+    if (in_array(strtolower($node->nodeName), $drupal_tags)) {
+      return true;
+    }
+    
     // If it's a script or style element, remove it.
     if (in_array(strtolower($node->nodeName), ['script', 'style'])) {
       $node->parentNode->removeChild($node);
-- 
GitLab


From 2d0eff48057dd293decd60230824dbd6c65033f8 Mon Sep 17 00:00:00 2001
From: Anthony Michaels <a.michaels@webpackager.com>
Date: Fri, 21 Mar 2025 19:17:39 +0000
Subject: [PATCH 12/12] Focus on specific empty tags thus avoiding Drupal
 reserved tags being affected;

---
 src/Drush/Commands/MigrateCleanupCommand.php | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php
index e054cd8..d6cd921 100644
--- a/src/Drush/Commands/MigrateCleanupCommand.php
+++ b/src/Drush/Commands/MigrateCleanupCommand.php
@@ -221,12 +221,6 @@ class MigrateCleanupCommand extends DrushCommands {
       }
       return true;
     }
-
-    // Always preserve Drupal-specific tags
-    $drupal_tags = ['drupal-entity', 'drupal-media'];
-    if (in_array(strtolower($node->nodeName), $drupal_tags)) {
-      return true;
-    }
     
     // If it's a script or style element, remove it.
     if (in_array(strtolower($node->nodeName), ['script', 'style'])) {
@@ -257,11 +251,13 @@ class MigrateCleanupCommand extends DrushCommands {
       }
     }
 
-    // If the node has no children and no text content, mark it for removal.
+    // Allow-list of tags that are dropped when empty; emptiness of any other tag is ignored.
+    $tags_to_check = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'em', 'i', 
+                      'small', 'mark', 'del', 'ins', 'sub', 'sup', 'q', 'cite', 'pre'];
+    
+    // A node left with no surviving children returns FALSE (remove) only if its tag is listed above.
     if (!$keep_node && $node->childNodes->length === 0) {
-      // Check if the node is not a self-closing tag like <img>, <br>, etc.
-      $self_closing_tags = ['img', 'br', 'hr', 'input', 'meta', 'link'];
-      if (!in_array(strtolower($node->nodeName), $self_closing_tags)) {
+      if (in_array(strtolower($node->nodeName), $tags_to_check)) {
         return false;
       }
     }
-- 
GitLab