From 6d5200ad20772325b0bbcefa54ae6376dca87557 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 10:19:11 +0000 Subject: [PATCH 01/12] Include sidebar Text block paragraph type in migration script; --- .../Commands/MigrateTextBlocksCommand.php | 314 +++++++++++++++--- 1 file changed, 262 insertions(+), 52 deletions(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 50cfa66..79b8ed2 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -108,6 +108,32 @@ class MigrateTextBlocksCommand extends DrushCommands { } } + /** + * Remove all paragraphs from a node's sidebar items field. + */ + protected function removeSidebarItems($node) { + if (!$node->hasField('field_sidebar_items')) { + return; + } + + $paragraphs = $node->get('field_sidebar_items')->referencedEntities(); + $removed_count = 0; + + foreach ($paragraphs as $paragraph) { + $paragraph->delete(); + $removed_count++; + } + + if ($removed_count > 0) { + $node->set('field_sidebar_items', []); + $node->save(); + $this->logger()->notice(dt('Removed @count sidebar paragraph(s) from node @nid', [ + '@count' => $removed_count, + '@nid' => $node->id(), + ])); + } + } + /** * Helper to find the new D10 Media entity that corresponds to a given old D7 file fid. * @@ -169,6 +195,68 @@ class MigrateTextBlocksCommand extends DrushCommands { return $requested_view_mode; } + /** + * Process text block content to convert media tokens and clean up. + * + * @param string $content + * The raw content from D7. + * @param string $format + * The text format to use. + * + * @return array + * An array with 'value' and 'format' keys for the processed content. + */ + protected function processTextBlockContent($content, $format) { + // Clean up content slightly. + $content = $this->cleanContent($content); + + // Remove literal occurrences of 'text_block'. 
+ $content = str_replace('text_block', '', $content); + + // Determine the text format to use in D10: if not found, fall back to 'filtered_html'. + $paragraph_format = !empty($format) ? $format : 'filtered_html'; + + // Convert D7 media tokens to <drupal-media>. + $content = preg_replace_callback( + '/\[\[\s*(\{.*?"fid":.*?\})\s*\]\]/s', + function ($matches) { + $json_string = $matches[1]; + $embed_data = json_decode($json_string, TRUE); + if (is_array($embed_data) && isset($embed_data['fid'])) { + // Get the correct new media entity. + $old_fid = $embed_data['fid']; + $media = $this->getMediaEntityFromOldFid($old_fid); + if ($media) { + // If "fields.format" is set in the old JSON, treat that as the requested view mode. + // Otherwise default to "media_library". + $requested_mode = !empty($embed_data['fields']['format']) + ? $embed_data['fields']['format'] + : 'media_library'; + + // Validate if this view mode actually exists in D10; fallback if not. + $view_mode = $this->getValidMediaViewMode($requested_mode); + + $uuid = $media->uuid(); + return '<drupal-media data-entity-type="media" data-entity-uuid="' . $uuid . '" data-view-mode="' . $view_mode . '"></drupal-media>'; + } + else { + // No mapping => log warning & keep original token. + $this->logger()->warning(sprintf('No media mapping found for old fid "%s".', $old_fid)); + return $matches[0]; + } + } + // If JSON parse fails or no "fid", leave token as-is. + return $matches[0]; + }, + $content + ); + + return [ + 'value' => $content, + 'format' => $paragraph_format, + ]; + } + /** * Migrate text blocks to text paragraphs. 
* @@ -176,6 +264,26 @@ class MigrateTextBlocksCommand extends DrushCommands { * @aliases cm-text-blocks */ public function migrateTextBlocks() { + try { + // First, migrate main content text blocks + $this->migrateMainContentTextBlocks(); + + // Then, migrate sidebar text blocks + $this->migrateSidebarTextBlocks(); + + $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.')); + } + catch (\Exception $e) { + $this->logger()->error(dt('Migration failed: @message', [ + '@message' => $e->getMessage(), + ])); + } + } + + /** + * Migrate text blocks to text paragraphs in the main content field. + */ + protected function migrateMainContentTextBlocks() { try { // Query the D7 database for text blocks. $query = $this->sourceDb->select('paragraphs_item', 'p') @@ -210,7 +318,7 @@ class MigrateTextBlocksCommand extends DrushCommands { $text_blocks = $query->execute()->fetchAll(); if (empty($text_blocks)) { - $this->logger()->warning(dt('No text blocks found to migrate.')); + $this->logger()->warning(dt('No main content text blocks found to migrate.')); return; } @@ -236,50 +344,10 @@ class MigrateTextBlocksCommand extends DrushCommands { $content .= $block->field_paragraph_text_content_value; } - // Clean up content slightly. - $content = $this->cleanContent($content); - - // Remove literal occurrences of 'text_block'. - $content = str_replace('text_block', '', $content); - - // Determine the text format to use in D10: if not found, fall back to 'filtered_html'. - $paragraph_format = !empty($block->field_paragraph_text_content_format) - ? $block->field_paragraph_text_content_format - : 'filtered_html'; - - // Convert D7 media tokens to <drupal-media>. - $content = preg_replace_callback( - '/\[\[\s*(\{.*?"fid":.*?\})\s*\]\]/s', - function ($matches) { - $json_string = $matches[1]; - $embed_data = json_decode($json_string, TRUE); - if (is_array($embed_data) && isset($embed_data['fid'])) { - // Get the correct new media entity. 
- $old_fid = $embed_data['fid']; - $media = $this->getMediaEntityFromOldFid($old_fid); - if ($media) { - // If "fields.format" is set in the old JSON, treat that as the requested view mode. - // Otherwise default to "media_library". - $requested_mode = !empty($embed_data['fields']['format']) - ? $embed_data['fields']['format'] - : 'media_library'; - - // Validate if this view mode actually exists in D10; fallback if not. - $view_mode = $this->getValidMediaViewMode($requested_mode); - - $uuid = $media->uuid(); - return '<drupal-media data-entity-type="media" data-entity-uuid="' . $uuid . '" data-view-mode="' . $view_mode . '"></drupal-media>'; - } - else { - // No mapping => log warning & keep original token. - $this->logger()->warning(sprintf('No media mapping found for old fid "%s".', $old_fid)); - return $matches[0]; - } - } - // If JSON parse fails or no "fid", leave token as-is. - return $matches[0]; - }, - $content + // Process the content + $processed_content = $this->processTextBlockContent( + $content, + $block->field_paragraph_text_content_format ); // Load the corresponding D10 node. @@ -294,42 +362,184 @@ class MigrateTextBlocksCommand extends DrushCommands { // Create a new paragraph of type 'text'. $paragraph = Paragraph::create([ 'type' => 'text', + 'langcode' => 'en', // Explicitly set language to English 'field_text' => [ - 'value' => $content, - 'format' => $paragraph_format, + 'value' => $processed_content['value'], + 'format' => $processed_content['format'], ], ]); $paragraph->save(); - // Attach the new paragraph to the node’s field_paragraph. + // Attach the new paragraph to the node's field_paragraph. 
$node->field_paragraph[] = [ 'target_id' => $paragraph->id(), 'target_revision_id' => $paragraph->getRevisionId(), ]; $node->save(); + + // Debug: Log the structure of the main content field after saving + $this->logger()->debug(dt('Main content field structure after save for node @nid: @count paragraphs', [ + '@nid' => $node->id(), + '@count' => count($node->field_paragraph), + ])); $success_count++; - $this->logger()->notice(dt('Successfully migrated text block @id to node @nid', [ + $this->logger()->notice(dt('Successfully migrated main content text block @id to node @nid', [ '@id' => $block->item_id, '@nid' => $block->entity_id, ])); } catch (\Exception $e) { $error_count++; - $this->logger()->error(dt('Error migrating text block @id: @message', [ + $this->logger()->error(dt('Error migrating main content text block @id: @message', [ '@id' => $block->item_id, '@message' => $e->getMessage(), ])); } } - $this->logger()->notice(dt('Migration complete. Successes: @success, Errors: @errors', [ + $this->logger()->notice(dt('Main content text block migration complete. Successes: @success, Errors: @errors', [ '@success' => $success_count, '@errors' => $error_count, ])); } catch (\Exception $e) { - $this->logger()->error(dt('Migration failed: @message', [ + $this->logger()->error(dt('Main content text block migration failed: @message', [ + '@message' => $e->getMessage(), + ])); + } + } + + /** + * Migrate text blocks to text paragraphs in the sidebar items field. + */ + protected function migrateSidebarTextBlocks() { + try { + // Query the D7 database for sidebar text blocks. + $query = $this->sourceDb->select('paragraphs_item', 'p') + ->fields('p', ['item_id', 'bundle', 'field_name']) + ->condition('p.bundle', 'text_block'); + + // Join with field_data_field_paragraph_heading for headings. 
+ $query->leftJoin( + 'field_data_field_paragraph_heading', + 'h', + 'p.item_id = h.entity_id AND h.entity_type = :entity_type', + [':entity_type' => 'paragraphs_item'] + ); + $query->fields('h', ['field_paragraph_heading_value']); + + // Join with field_data_field_paragraph_text_content for body + format. + $query->leftJoin( + 'field_data_field_paragraph_text_content', + 't', + 'p.item_id = t.entity_id AND t.entity_type = :entity_type', + [':entity_type' => 'paragraphs_item'] + ); + $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']); + + // Join with node reference field to get parent node ID, specifically for sidebar items. + $query->leftJoin( + 'field_data_field_sidebar_items', + 'n', + 'p.item_id = n.field_sidebar_items_value' + ); + $query->fields('n', ['entity_id']); + + $text_blocks = $query->execute()->fetchAll(); + if (empty($text_blocks)) { + $this->logger()->warning(dt('No sidebar text blocks found to migrate.')); + return; + } + + $success_count = 0; + $error_count = 0; + $processed_nodes = []; + + foreach ($text_blocks as $block) { + try { + // Skip if no node reference. + if (empty($block->entity_id)) { + $this->logger()->warning(dt('Skipping sidebar text block @id - no node reference found', [ + '@id' => $block->item_id, + ])); + continue; + } + + // Load the corresponding D10 node. + $node = $this->entityTypeManager->getStorage('node')->load($block->entity_id); + if (!$node) { + throw new \Exception("Node {$block->entity_id} not found in D10."); + } + + // Remove existing sidebar items before adding new ones, but only once per node + if (!in_array($block->entity_id, $processed_nodes)) { + $this->removeSidebarItems($node); + $processed_nodes[] = $block->entity_id; + } + + // Combine heading + body text. + $content = ''; + if (!empty($block->field_paragraph_heading_value)) { + $content .= '<h2>' . $block->field_paragraph_heading_value . 
'</h2>'; + } + if (!empty($block->field_paragraph_text_content_value)) { + $content .= $block->field_paragraph_text_content_value; + } + + // Process the content + $processed_content = $this->processTextBlockContent( + $content, + $block->field_paragraph_text_content_format + ); + + // Create a new paragraph of type 'text'. + $paragraph = Paragraph::create([ + 'type' => 'text', + 'langcode' => 'en', // Explicitly set language to English + 'field_text' => [ + 'value' => $processed_content['value'], + 'format' => $processed_content['format'], + ], + ]); + $paragraph->save(); + + // Attach the new paragraph to the node's field_sidebar_items. + // Use the field directly to ensure proper structure + $node->field_sidebar_items[] = [ + 'target_id' => $paragraph->id(), + 'target_revision_id' => $paragraph->getRevisionId(), + ]; + $node->save(); + + // Debug: Log the structure of the sidebar field after saving + $this->logger()->debug(dt('Sidebar field structure after save for node @nid: @count paragraphs', [ + '@nid' => $node->id(), + '@count' => count($node->field_sidebar_items), + ])); + + $success_count++; + $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [ + '@id' => $block->item_id, + '@nid' => $block->entity_id, + ])); + } + catch (\Exception $e) { + $error_count++; + $this->logger()->error(dt('Error migrating sidebar text block @id: @message', [ + '@id' => $block->item_id, + '@message' => $e->getMessage(), + ])); + } + } + + $this->logger()->notice(dt('Sidebar text block migration complete. 
Successes: @success, Errors: @errors', [ + '@success' => $success_count, + '@errors' => $error_count, + ])); + } + catch (\Exception $e) { + $this->logger()->error(dt('Sidebar text block migration failed: @message', [ '@message' => $e->getMessage(), ])); } -- GitLab From 41fe5aad548d3bc68024c98015b2c4f21cd26bfc Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 19:37:49 +0000 Subject: [PATCH 02/12] Attempt to update sidebar mappings directly; ensure no sidebar items exist from the start; --- .../Commands/MigrateTextBlocksCommand.php | 136 +++++++++++++----- 1 file changed, 97 insertions(+), 39 deletions(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 79b8ed2..364e62f 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -213,7 +213,7 @@ class MigrateTextBlocksCommand extends DrushCommands { // Remove literal occurrences of 'text_block'. $content = str_replace('text_block', '', $content); - // Determine the text format to use in D10: if not found, fall back to 'filtered_html'. + // Determine the text format to use in D10: if not found, fall back to 'filtered_html' $paragraph_format = !empty($format) ? $format : 'filtered_html'; // Convert D7 media tokens to <drupal-media>. 
@@ -268,8 +268,13 @@ class MigrateTextBlocksCommand extends DrushCommands { // First, migrate main content text blocks $this->migrateMainContentTextBlocks(); + // Rebuild caches between migrations + \Drupal::service('cache.render')->invalidateAll(); + \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions(); + // Then, migrate sidebar text blocks $this->migrateSidebarTextBlocks(); + $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.')); } @@ -285,10 +290,18 @@ class MigrateTextBlocksCommand extends DrushCommands { */ protected function migrateMainContentTextBlocks() { try { - // Query the D7 database for text blocks. - $query = $this->sourceDb->select('paragraphs_item', 'p') - ->fields('p', ['item_id', 'bundle', 'field_name']) - ->condition('p.bundle', 'text_block'); + // Query the D7 database for text blocks in the main content field. + $query = $this->sourceDb->select('field_data_field_content_items', 'n') + ->fields('n', ['entity_id', 'field_content_items_value']); + + // Join with paragraphs_item to get only text blocks + $query->join( + 'paragraphs_item', + 'p', + 'p.item_id = n.field_content_items_value AND p.bundle = :bundle', + [':bundle' => 'text_block'] + ); + $query->fields('p', ['item_id', 'bundle', 'field_name']); // Join with field_data_field_paragraph_heading for headings. $query->leftJoin( @@ -308,14 +321,6 @@ class MigrateTextBlocksCommand extends DrushCommands { ); $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']); - // Join with node reference field to get parent node ID. 
- $query->leftJoin( - 'field_data_field_content_items', - 'n', - 'p.item_id = n.field_content_items_value' - ); - $query->fields('n', ['entity_id']); - $text_blocks = $query->execute()->fetchAll(); if (empty($text_blocks)) { $this->logger()->warning(dt('No main content text blocks found to migrate.')); @@ -362,10 +367,10 @@ class MigrateTextBlocksCommand extends DrushCommands { // Create a new paragraph of type 'text'. $paragraph = Paragraph::create([ 'type' => 'text', - 'langcode' => 'en', // Explicitly set language to English + 'langcode' => 'und', // Set paragraph language to 'und' (undefined) to match existing data 'field_text' => [ 'value' => $processed_content['value'], - 'format' => $processed_content['format'], + 'format' => $processed_content['format'], // Use the imported format for main content ], ]); $paragraph->save(); @@ -415,10 +420,23 @@ class MigrateTextBlocksCommand extends DrushCommands { */ protected function migrateSidebarTextBlocks() { try { - // Query the D7 database for sidebar text blocks. - $query = $this->sourceDb->select('paragraphs_item', 'p') - ->fields('p', ['item_id', 'bundle', 'field_name']) - ->condition('p.bundle', 'text_block'); + // Truncate the sidebar items tables to start fresh + $this->targetDb->truncate('node__field_sidebar_items')->execute(); + $this->targetDb->truncate('node_revision__field_sidebar_items')->execute(); + $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.')); + + // Query the D7 database for text blocks in the sidebar field. + $query = $this->sourceDb->select('field_data_field_sidebar_items', 'n') + ->fields('n', ['entity_id', 'field_sidebar_items_value']); + + // Join with paragraphs_item to get only text blocks + $query->join( + 'paragraphs_item', + 'p', + 'p.item_id = n.field_sidebar_items_value AND p.bundle = :bundle', + [':bundle' => 'text_block'] + ); + $query->fields('p', ['item_id', 'bundle', 'field_name']); // Join with field_data_field_paragraph_heading for headings. 
$query->leftJoin( @@ -438,14 +456,6 @@ class MigrateTextBlocksCommand extends DrushCommands { ); $query->fields('t', ['field_paragraph_text_content_value', 'field_paragraph_text_content_format']); - // Join with node reference field to get parent node ID, specifically for sidebar items. - $query->leftJoin( - 'field_data_field_sidebar_items', - 'n', - 'p.item_id = n.field_sidebar_items_value' - ); - $query->fields('n', ['entity_id']); - $text_blocks = $query->execute()->fetchAll(); if (empty($text_blocks)) { $this->logger()->warning(dt('No sidebar text blocks found to migrate.')); @@ -496,27 +506,74 @@ class MigrateTextBlocksCommand extends DrushCommands { // Create a new paragraph of type 'text'. $paragraph = Paragraph::create([ 'type' => 'text', - 'langcode' => 'en', // Explicitly set language to English + 'langcode' => 'und', // Set paragraph language to 'und' (undefined) to match existing data 'field_text' => [ 'value' => $processed_content['value'], - 'format' => $processed_content['format'], + 'format' => 'basic_html', // Explicitly set to basic_html as required ], ]); $paragraph->save(); - // Attach the new paragraph to the node's field_sidebar_items. 
- // Use the field directly to ensure proper structure - $node->field_sidebar_items[] = [ - 'target_id' => $paragraph->id(), - 'target_revision_id' => $paragraph->getRevisionId(), - ]; - $node->save(); + // Instead of using the entity API, directly insert into the database + // Insert into node__field_sidebar_items table + $this->targetDb->insert('node__field_sidebar_items') + ->fields([ + 'bundle' => 'page', + 'deleted' => 0, + 'entity_id' => $node->id(), + 'revision_id' => $node->getRevisionId(), + 'langcode' => 'und', + 'delta' => 0, + 'field_sidebar_items_target_id' => $paragraph->id(), + 'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(), + ]) + ->execute(); + + // Also insert into node_revision__field_sidebar_items table + $this->targetDb->insert('node_revision__field_sidebar_items') + ->fields([ + 'bundle' => 'page', + 'deleted' => 0, + 'entity_id' => $node->id(), + 'revision_id' => $node->getRevisionId(), + 'langcode' => 'und', + 'delta' => 0, + 'field_sidebar_items_target_id' => $paragraph->id(), + 'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(), + ]) + ->execute(); + + + // Debug: Log detailed information about the paragraph and the node reference + $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [ + '@nid' => $node->id(), + '@pid' => $paragraph->id(), + '@revid' => $paragraph->getRevisionId(), + '@type' => $paragraph->getType(), + '@lang' => $paragraph->language()->getId(), + '@format' => $paragraph->get('field_text')->format, + ])); - // Debug: Log the structure of the sidebar field after saving - $this->logger()->debug(dt('Sidebar field structure after save for node @nid: @count paragraphs', [ + // Debug: Log the node's field_sidebar_items values and revision ID after saving + $this->logger()->notice(dt('IMPORTANT: Node @nid (revision: @vid) field_sidebar_items values: @values', [ '@nid' => $node->id(), - '@count' 
=> count($node->field_sidebar_items), + '@vid' => $node->getRevisionId(), + '@values' => json_encode($node->get('field_sidebar_items')->getValue()), ])); + + // Check if the node type has a view display configuration for the sidebar field + $entity_display_repository = \Drupal::service('entity_display.repository'); + $view_display = $entity_display_repository->getViewDisplay('node', $node->bundle(), 'default'); + if ($view_display && $view_display->getComponent('field_sidebar_items')) { + $this->logger()->notice(dt('Node @nid has view display configuration for field_sidebar_items', [ + '@nid' => $node->id(), + ])); + } else { + $this->logger()->notice(dt('Node @nid (@type) does not have view display configuration for field_sidebar_items', [ + '@nid' => $node->id(), + '@type' => $node->bundle(), + ])); + } $success_count++; $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [ @@ -545,6 +602,7 @@ class MigrateTextBlocksCommand extends DrushCommands { } } + /** * {@inheritdoc} */ -- GitLab From 8912ac9e8bdf62d89abae30bda7a7393839e0533 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 20:25:10 +0000 Subject: [PATCH 03/12] Split migrateSidebarTextBlocks() into 2 parts --- .../Commands/MigrateTextBlocksCommand.php | 167 +++++++++++------- 1 file changed, 105 insertions(+), 62 deletions(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 364e62f..9fa68d1 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -272,9 +272,15 @@ class MigrateTextBlocksCommand extends DrushCommands { \Drupal::service('cache.render')->invalidateAll(); \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions(); - // Then, migrate sidebar text blocks - $this->migrateSidebarTextBlocks(); - + // Then, create sidebar paragraph items + $sidebar_items = 
$this->createSidebarParagraphItems(); + + // Rebuild caches again + \Drupal::service('cache.render')->invalidateAll(); + \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions(); + + // Finally, insert sidebar items into the database + $this->insertSidebarItemsIntoDatabase($sidebar_items); $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.')); } @@ -416,14 +422,14 @@ class MigrateTextBlocksCommand extends DrushCommands { } /** - * Migrate text blocks to text paragraphs in the sidebar items field. + * Create sidebar paragraph items and return an array of data for database insertion. + * + * @return array + * An array of sidebar item data for database insertion. */ - protected function migrateSidebarTextBlocks() { + protected function createSidebarParagraphItems() { try { - // Truncate the sidebar items tables to start fresh - $this->targetDb->truncate('node__field_sidebar_items')->execute(); - $this->targetDb->truncate('node_revision__field_sidebar_items')->execute(); - $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.')); + $sidebar_items = []; // Query the D7 database for text blocks in the sidebar field. 
$query = $this->sourceDb->select('field_data_field_sidebar_items', 'n') @@ -459,7 +465,7 @@ class MigrateTextBlocksCommand extends DrushCommands { $text_blocks = $query->execute()->fetchAll(); if (empty($text_blocks)) { $this->logger()->warning(dt('No sidebar text blocks found to migrate.')); - return; + return $sidebar_items; } $success_count = 0; @@ -514,95 +520,132 @@ class MigrateTextBlocksCommand extends DrushCommands { ]); $paragraph->save(); - // Instead of using the entity API, directly insert into the database + // Store the paragraph and node information for later database insertion + $sidebar_items[] = [ + 'node_id' => $node->id(), + 'node_revision_id' => $node->getRevisionId(), + 'node_bundle' => $node->bundle(), + 'paragraph_id' => $paragraph->id(), + 'paragraph_revision_id' => $paragraph->getRevisionId(), + ]; + + // Debug: Log detailed information about the paragraph and the node reference + $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [ + '@nid' => $node->id(), + '@pid' => $paragraph->id(), + '@revid' => $paragraph->getRevisionId(), + '@type' => $paragraph->getType(), + '@lang' => $paragraph->language()->getId(), + '@format' => $paragraph->get('field_text')->format, + ])); + + $success_count++; + $this->logger()->notice(dt('Successfully created sidebar paragraph for text block @id to node @nid', [ + '@id' => $block->item_id, + '@nid' => $block->entity_id, + ])); + } + catch (\Exception $e) { + $error_count++; + $this->logger()->error(dt('Error creating sidebar paragraph for text block @id: @message', [ + '@id' => $block->item_id, + '@message' => $e->getMessage(), + ])); + } + } + + $this->logger()->notice(dt('Sidebar paragraph creation complete. 
Successes: @success, Errors: @errors', [ + '@success' => $success_count, + '@errors' => $error_count, + ])); + + return $sidebar_items; + } + catch (\Exception $e) { + $this->logger()->error(dt('Sidebar paragraph creation failed: @message', [ + '@message' => $e->getMessage(), + ])); + return []; + } + } + + /** + * Insert sidebar items into the database. + * + * @param array $sidebar_items + * An array of sidebar item data for database insertion. + */ + protected function insertSidebarItemsIntoDatabase(array $sidebar_items) { + try { + // Truncate the sidebar items tables to start fresh + $this->targetDb->truncate('node__field_sidebar_items')->execute(); + $this->targetDb->truncate('node_revision__field_sidebar_items')->execute(); + $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.')); + + if (empty($sidebar_items)) { + $this->logger()->warning(dt('No sidebar items to insert into the database.')); + return; + } + + $success_count = 0; + $error_count = 0; + + foreach ($sidebar_items as $item) { + try { // Insert into node__field_sidebar_items table $this->targetDb->insert('node__field_sidebar_items') ->fields([ - 'bundle' => 'page', + 'bundle' => $item['node_bundle'], 'deleted' => 0, - 'entity_id' => $node->id(), - 'revision_id' => $node->getRevisionId(), + 'entity_id' => $item['node_id'], + 'revision_id' => $item['node_revision_id'], 'langcode' => 'und', 'delta' => 0, - 'field_sidebar_items_target_id' => $paragraph->id(), - 'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(), + 'field_sidebar_items_target_id' => $item['paragraph_id'], + 'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'], ]) ->execute(); // Also insert into node_revision__field_sidebar_items table $this->targetDb->insert('node_revision__field_sidebar_items') ->fields([ - 'bundle' => 'page', + 'bundle' => $item['node_bundle'], 'deleted' => 0, - 'entity_id' => $node->id(), - 'revision_id' => $node->getRevisionId(), + 'entity_id' => 
$item['node_id'], + 'revision_id' => $item['node_revision_id'], 'langcode' => 'und', 'delta' => 0, - 'field_sidebar_items_target_id' => $paragraph->id(), - 'field_sidebar_items_target_revision_id' => $paragraph->getRevisionId(), + 'field_sidebar_items_target_id' => $item['paragraph_id'], + 'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'], ]) ->execute(); - - - // Debug: Log detailed information about the paragraph and the node reference - $this->logger()->notice(dt('IMPORTANT: Sidebar paragraph created for node @nid: ID=@pid, RevID=@revid, Type=@type, Lang=@lang, Format=@format', [ - '@nid' => $node->id(), - '@pid' => $paragraph->id(), - '@revid' => $paragraph->getRevisionId(), - '@type' => $paragraph->getType(), - '@lang' => $paragraph->language()->getId(), - '@format' => $paragraph->get('field_text')->format, - ])); - - // Debug: Log the node's field_sidebar_items values and revision ID after saving - $this->logger()->notice(dt('IMPORTANT: Node @nid (revision: @vid) field_sidebar_items values: @values', [ - '@nid' => $node->id(), - '@vid' => $node->getRevisionId(), - '@values' => json_encode($node->get('field_sidebar_items')->getValue()), - ])); - // Check if the node type has a view display configuration for the sidebar field - $entity_display_repository = \Drupal::service('entity_display.repository'); - $view_display = $entity_display_repository->getViewDisplay('node', $node->bundle(), 'default'); - if ($view_display && $view_display->getComponent('field_sidebar_items')) { - $this->logger()->notice(dt('Node @nid has view display configuration for field_sidebar_items', [ - '@nid' => $node->id(), - ])); - } else { - $this->logger()->notice(dt('Node @nid (@type) does not have view display configuration for field_sidebar_items', [ - '@nid' => $node->id(), - '@type' => $node->bundle(), - ])); - } - $success_count++; - $this->logger()->notice(dt('Successfully migrated sidebar text block @id to node @nid', [ - '@id' => $block->item_id, - 
'@nid' => $block->entity_id, + $this->logger()->notice(dt('Successfully inserted sidebar item for node @nid', [ + '@nid' => $item['node_id'], ])); } catch (\Exception $e) { $error_count++; - $this->logger()->error(dt('Error migrating sidebar text block @id: @message', [ - '@id' => $block->item_id, + $this->logger()->error(dt('Error inserting sidebar item for node @nid: @message', [ + '@nid' => $item['node_id'], '@message' => $e->getMessage(), ])); } } - - $this->logger()->notice(dt('Sidebar text block migration complete. Successes: @success, Errors: @errors', [ + + $this->logger()->notice(dt('Sidebar item database insertion complete. Successes: @success, Errors: @errors', [ '@success' => $success_count, '@errors' => $error_count, ])); } catch (\Exception $e) { - $this->logger()->error(dt('Sidebar text block migration failed: @message', [ + $this->logger()->error(dt('Sidebar item database insertion failed: @message', [ '@message' => $e->getMessage(), ])); } } - /** * {@inheritdoc} */ -- GitLab From 0ac836bede59f5978886263f56b368d21a20fef6 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 21:18:15 +0000 Subject: [PATCH 04/12] Rebuild caches as a final action; --- src/Drush/Commands/MigrateTextBlocksCommand.php | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 9fa68d1..5d6f0f0 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -269,19 +269,17 @@ class MigrateTextBlocksCommand extends DrushCommands { $this->migrateMainContentTextBlocks(); // Rebuild caches between migrations - \Drupal::service('cache.render')->invalidateAll(); - \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions(); + drupal_flush_all_caches(); // Then, create sidebar paragraph items $sidebar_items = 
$this->createSidebarParagraphItems(); - // Rebuild caches again - \Drupal::service('cache.render')->invalidateAll(); - \Drupal::service('plugin.manager.entity_reference_selection')->clearCachedDefinitions(); - // Finally, insert sidebar items into the database $this->insertSidebarItemsIntoDatabase($sidebar_items); + // Rebuild caches again + drupal_flush_all_caches(); + $this->logger()->notice(dt('Text block migration complete for both main content and sidebar items.')); } catch (\Exception $e) { -- GitLab From 24bc17e6ae7f053c520872328af5e91f90c58068 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 22:12:21 +0000 Subject: [PATCH 05/12] Avoid db inserts, use $node->save() to attach paragraphs to nodes; --- .../Commands/MigrateTextBlocksCommand.php | 35 ++++--------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 5d6f0f0..730889e 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -590,34 +590,13 @@ class MigrateTextBlocksCommand extends DrushCommands { foreach ($sidebar_items as $item) { try { - // Insert into node__field_sidebar_items table - $this->targetDb->insert('node__field_sidebar_items') - ->fields([ - 'bundle' => $item['node_bundle'], - 'deleted' => 0, - 'entity_id' => $item['node_id'], - 'revision_id' => $item['node_revision_id'], - 'langcode' => 'und', - 'delta' => 0, - 'field_sidebar_items_target_id' => $item['paragraph_id'], - 'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'], - ]) - ->execute(); - - // Also insert into node_revision__field_sidebar_items table - $this->targetDb->insert('node_revision__field_sidebar_items') - ->fields([ - 'bundle' => $item['node_bundle'], - 'deleted' => 0, - 'entity_id' => $item['node_id'], - 'revision_id' => $item['node_revision_id'], - 'langcode' => 
'und', - 'delta' => 0, - 'field_sidebar_items_target_id' => $item['paragraph_id'], - 'field_sidebar_items_target_revision_id' => $item['paragraph_revision_id'], - ]) - ->execute(); - + $node = $this->entityTypeManager->getStorage('node')->load($item['node_id']); + $node->field_sidebar_items[] = [ + 'target_id' => $item['paragraph_id'], + 'target_revision_id' => $item['paragraph_revision_id'], + ]; + $node->save(); + $success_count++; $this->logger()->notice(dt('Successfully inserted sidebar item for node @nid', [ '@nid' => $item['node_id'], -- GitLab From 091c2d55a32fe10207e6f3035efc9f33e0826276 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Tue, 18 Mar 2025 22:33:39 +0000 Subject: [PATCH 06/12] Minor --- src/Drush/Commands/MigrateTextBlocksCommand.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index 730889e..a3f82bf 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -578,7 +578,7 @@ class MigrateTextBlocksCommand extends DrushCommands { // Truncate the sidebar items tables to start fresh $this->targetDb->truncate('node__field_sidebar_items')->execute(); $this->targetDb->truncate('node_revision__field_sidebar_items')->execute(); - $this->logger()->notice(dt('Truncated sidebar items tables to start fresh.')); + $this->logger()->notice(dt('Truncated sidebar items tables to start afresh.')); if (empty($sidebar_items)) { $this->logger()->warning(dt('No sidebar items to insert into the database.')); -- GitLab From dc73033f816bfd5c40e22a4b9af97686f67015d6 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Wed, 19 Mar 2025 09:20:03 +0000 Subject: [PATCH 07/12] Minor --- src/Drush/Commands/MigrateTextBlocksCommand.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/Drush/Commands/MigrateTextBlocksCommand.php b/src/Drush/Commands/MigrateTextBlocksCommand.php index a3f82bf..dcf6a33 100644 --- a/src/Drush/Commands/MigrateTextBlocksCommand.php +++ b/src/Drush/Commands/MigrateTextBlocksCommand.php @@ -575,7 +575,7 @@ class MigrateTextBlocksCommand extends DrushCommands { */ protected function insertSidebarItemsIntoDatabase(array $sidebar_items) { try { - // Truncate the sidebar items tables to start fresh + // Truncate the sidebar items tables to start afresh $this->targetDb->truncate('node__field_sidebar_items')->execute(); $this->targetDb->truncate('node_revision__field_sidebar_items')->execute(); $this->logger()->notice(dt('Truncated sidebar items tables to start afresh.')); -- GitLab From d57b4ff57b2e6369ce99836d1470f6c793b88d35 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Thu, 20 Mar 2025 09:22:16 +0000 Subject: [PATCH 08/12] Initial script for cleanup Drush service; --- cambridge_migrations.services.yml | 6 + drush.services.yml | 6 + src/Drush/Commands/MigrateCleanupCommand.php | 206 +++++++++++++++++++ 3 files changed, 218 insertions(+) create mode 100644 src/Drush/Commands/MigrateCleanupCommand.php diff --git a/cambridge_migrations.services.yml b/cambridge_migrations.services.yml index cc08c86..fab37c3 100644 --- a/cambridge_migrations.services.yml +++ b/cambridge_migrations.services.yml @@ -10,3 +10,9 @@ services: arguments: ['@entity_type.manager', '@file_system', '@database'] tags: - { name: drush.command } + + cambridge_migrations.cleanup_command: + class: Drupal\cambridge_migrations\Drush\Commands\MigrateCleanupCommand + arguments: ['@entity_type.manager'] + tags: + - { name: drush.command } diff --git a/drush.services.yml b/drush.services.yml index cc08c86..fab37c3 100644 --- a/drush.services.yml +++ b/drush.services.yml @@ -10,3 +10,9 @@ services: arguments: ['@entity_type.manager', '@file_system', '@database'] tags: - { name: drush.command } + + 
cambridge_migrations.cleanup_command: + class: Drupal\cambridge_migrations\Drush\Commands\MigrateCleanupCommand + arguments: ['@entity_type.manager'] + tags: + - { name: drush.command } diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php new file mode 100644 index 0000000..e934c06 --- /dev/null +++ b/src/Drush/Commands/MigrateCleanupCommand.php @@ -0,0 +1,206 @@ +<?php + +namespace Drupal\cambridge_migrations\Drush\Commands; + +use Drush\Commands\DrushCommands; +use Drupal\Core\Entity\EntityTypeManagerInterface; +use Drupal\paragraphs\Entity\Paragraph; +use Symfony\Component\DependencyInjection\ContainerInterface; +use DOMDocument; +use DOMElement; + +/** + * Drush command for cleaning up paragraph HTML content. + */ +class MigrateCleanupCommand extends DrushCommands { + + /** + * The entity type manager. + * + * @var \Drupal\Core\Entity\EntityTypeManagerInterface + */ + protected $entityTypeManager; + + /** + * Constructs a new MigrateCleanupCommand object. + */ + public function __construct(EntityTypeManagerInterface $entity_type_manager) { + parent::__construct(); + $this->entityTypeManager = $entity_type_manager; + } + + /** + * Clean HTML content by removing unwanted attributes and empty tags. + * + * @param string $html + * The HTML content to clean. + * + * @return string + * The cleaned HTML content. + */ + protected function cleanHtml($html) { + if (empty($html)) { + return $html; + } + + // Create a new DOM document + $doc = new DOMDocument(); + + // Preserve whitespace to maintain formatting + $doc->preserveWhiteSpace = true; + $doc->formatOutput = true; + + // Load HTML with UTF-8 encoding and suppress warnings + $doc->loadHTML('<?xml encoding="UTF-8">' . 
$html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); + + // Remove unwanted attributes and empty tags recursively + $this->cleanNode($doc->documentElement); + + // Get the cleaned HTML + $cleanedHtml = $doc->saveHTML($doc->documentElement); + + // Remove the XML declaration if present + $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml); + + return trim($cleanedHtml); + } + + /** + * Recursively clean a DOM node by removing unwanted attributes and empty tags. + * + * @param DOMElement $node + * The DOM node to clean. + * + * @return bool + * TRUE if the node should be kept, FALSE if it should be removed. + */ + protected function cleanNode(DOMElement $node) { + // Remove style and script elements entirely + $tagName = strtolower($node->tagName); + if ($tagName === 'style' || $tagName === 'script') { + return false; + } + + // Remove class and style attributes + $node->removeAttribute('class'); + $node->removeAttribute('style'); + + // Process child nodes + $children = []; + foreach ($node->childNodes as $child) { + $children[] = $child; + } + + $hasNonEmptyContent = false; + + foreach ($children as $child) { + if ($child->nodeType === XML_ELEMENT_NODE) { + // Recursively clean child element + $keepChild = $this->cleanNode($child); + if (!$keepChild) { + $node->removeChild($child); + } else { + $hasNonEmptyContent = true; + } + } + elseif ($child->nodeType === XML_TEXT_NODE) { + // Check if text node contains more than just whitespace and non-breaking spaces + $text = trim($child->nodeValue); + $textWithoutNbsp = str_replace("\xC2\xA0", '', $text); // Remove non-breaking spaces + if (!empty($textWithoutNbsp)) { + $hasNonEmptyContent = true; + } + } + } + + // Return true if node has meaningful content + return $hasNonEmptyContent; + } + + /** + * Clean up HTML content in text paragraphs. 
+ * + * @command cambridge:migrate-cleanup + * @aliases cm-cleanup + */ + public function migrateCleanup() { + try { + // Query all paragraphs of type "text" + $storage = $this->entityTypeManager->getStorage('paragraph'); + $query = $storage->getQuery() + ->condition('type', 'text') + ->accessCheck(FALSE); + $pids = $query->execute(); + + if (empty($pids)) { + $this->logger()->warning(dt('No text paragraphs found to clean up.')); + return; + } + + $success_count = 0; + $error_count = 0; + + foreach ($pids as $pid) { + try { + /** @var \Drupal\paragraphs\Entity\Paragraph $paragraph */ + $paragraph = $storage->load($pid); + + if (!$paragraph->hasField('field_text')) { + $this->logger()->warning(dt('Paragraph @pid does not have field_text field.', [ + '@pid' => $pid, + ])); + continue; + } + + $text_field = $paragraph->get('field_text'); + $value = $text_field->value; + $format = $text_field->format; + + // Clean the HTML content + $cleaned_value = $this->cleanHtml($value); + + // Update the paragraph if content changed + if ($cleaned_value !== $value) { + $text_field->setValue([ + 'value' => $cleaned_value, + 'format' => $format, + ]); + $paragraph->save(); + + $success_count++; + $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [ + '@pid' => $pid, + ])); + } + } + catch (\Exception $e) { + $error_count++; + $this->logger()->error(dt('Error cleaning paragraph @pid: @message', [ + '@pid' => $pid, + '@message' => $e->getMessage(), + ])); + } + } + + $this->logger()->notice(dt('HTML cleanup complete. 
Successes: @success, Errors: @errors', [ + '@success' => $success_count, + '@errors' => $error_count, + ])); + } + catch (\Exception $e) { + $this->logger()->error(dt('HTML cleanup failed: @message', [ + '@message' => $e->getMessage(), + ])); + } + } + + /** + * {@inheritdoc} + */ + public static function create(ContainerInterface $container) { + return new static( + $container->get('entity_type_manager') + ); + } + +} -- GitLab From 95a0c966593c28e2f075eacf0ea7fe21d49c7e8f Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Thu, 20 Mar 2025 09:36:39 +0000 Subject: [PATCH 09/12] Update - removal process too thorough, removes legitimate content; --- src/Drush/Commands/MigrateCleanupCommand.php | 70 ++++++++++++++------ 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php index e934c06..d37d3b7 100644 --- a/src/Drush/Commands/MigrateCleanupCommand.php +++ b/src/Drush/Commands/MigrateCleanupCommand.php @@ -50,19 +50,37 @@ class MigrateCleanupCommand extends DrushCommands { $doc->preserveWhiteSpace = true; $doc->formatOutput = true; - // Load HTML with UTF-8 encoding and suppress warnings + // Suppress warnings during HTML loading + $internalErrors = libxml_use_internal_errors(true); + + // Load HTML with UTF-8 encoding $doc->loadHTML('<?xml encoding="UTF-8">' . 
$html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); + + // Restore error handling + libxml_use_internal_errors($internalErrors); - // Remove unwanted attributes and empty tags recursively - $this->cleanNode($doc->documentElement); + // Process the document if it loaded successfully + if ($doc->documentElement) { + // Remove unwanted attributes and empty tags recursively + $this->cleanNode($doc->documentElement); - // Get the cleaned HTML - $cleanedHtml = $doc->saveHTML($doc->documentElement); + // Get the cleaned HTML + $cleanedHtml = $doc->saveHTML($doc->documentElement); - // Remove the XML declaration if present - $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml); + // Remove the XML declaration if present + $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml); + + // If the result is empty but the original wasn't, return the original + if (empty(trim($cleanedHtml)) && !empty(trim($html))) { + $this->logger()->warning('Cleaning resulted in empty content, keeping original'); + return $html; + } - return trim($cleanedHtml); + return trim($cleanedHtml); + } + + // If document couldn't be properly parsed, return original HTML + return $html; } /** @@ -97,24 +115,23 @@ class MigrateCleanupCommand extends DrushCommands { if ($child->nodeType === XML_ELEMENT_NODE) { // Recursively clean child element $keepChild = $this->cleanNode($child); - if (!$keepChild) { + if (!$keepChild && ($child->nodeName === 'style' || $child->nodeName === 'script')) { + // Only remove style and script elements, keep other elements even if empty $node->removeChild($child); - } else { + } + else { + // Always keep other elements, even if they're empty $hasNonEmptyContent = true; } } elseif ($child->nodeType === XML_TEXT_NODE) { - // Check if text node contains more than just whitespace and non-breaking spaces - $text = trim($child->nodeValue); - $textWithoutNbsp = str_replace("\xC2\xA0", '', $text); // Remove non-breaking spaces - if (!empty($textWithoutNbsp)) { 
- $hasNonEmptyContent = true; - } + // Always keep text nodes, even if they're just whitespace + $hasNonEmptyContent = true; } } - // Return true if node has meaningful content - return $hasNonEmptyContent; + // Always keep the node unless it's a style or script element + return true; } /** @@ -139,6 +156,7 @@ class MigrateCleanupCommand extends DrushCommands { $success_count = 0; $error_count = 0; + $unchanged_count = 0; foreach ($pids as $pid) { try { @@ -156,6 +174,14 @@ class MigrateCleanupCommand extends DrushCommands { $value = $text_field->value; $format = $text_field->format; + // Skip if empty + if (empty($value)) { + $this->logger()->notice(dt('Skipping empty paragraph @pid', [ + '@pid' => $pid, + ])); + continue; + } + // Clean the HTML content $cleaned_value = $this->cleanHtml($value); @@ -171,6 +197,11 @@ class MigrateCleanupCommand extends DrushCommands { $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [ '@pid' => $pid, ])); + } else { + $unchanged_count++; + $this->logger()->info(dt('No changes needed for paragraph @pid', [ + '@pid' => $pid, + ])); } } catch (\Exception $e) { @@ -182,8 +213,9 @@ class MigrateCleanupCommand extends DrushCommands { } } - $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Errors: @errors', [ + $this->logger()->notice(dt('HTML cleanup complete. 
Successes: @success, Unchanged: @unchanged, Errors: @errors', [ '@success' => $success_count, + '@unchanged' => $unchanged_count, '@errors' => $error_count, ])); } -- GitLab From 8bab4800ad4eb4b4b848197772774e9a7f030497 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Thu, 20 Mar 2025 09:45:26 +0000 Subject: [PATCH 10/12] Redo cleanup process; include extra notices; --- src/Drush/Commands/MigrateCleanupCommand.php | 347 ++++++++++--------- 1 file changed, 192 insertions(+), 155 deletions(-) diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php index d37d3b7..db6a5a0 100644 --- a/src/Drush/Commands/MigrateCleanupCommand.php +++ b/src/Drush/Commands/MigrateCleanupCommand.php @@ -8,9 +8,20 @@ use Drupal\paragraphs\Entity\Paragraph; use Symfony\Component\DependencyInjection\ContainerInterface; use DOMDocument; use DOMElement; +use DOMNode; /** - * Drush command for cleaning up paragraph HTML content. + * Drush command for cleaning up HTML content in text paragraphs. + * + * This command: + * - Queries all paragraphs of type "text" + * - For each paragraph, processes its HTML content to: + * - Remove all class and style attributes from every element + * - Recursively remove empty tags or tags with only non-breaking spaces + * - Remove all <style> and <script> elements entirely + * + * To run: + * ddev drush cambridge:migrate-cleanup */ class MigrateCleanupCommand extends DrushCommands { @@ -23,207 +34,233 @@ class MigrateCleanupCommand extends DrushCommands { /** * Constructs a new MigrateCleanupCommand object. + * + * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager + * The entity type manager. 
*/ - public function __construct(EntityTypeManagerInterface $entity_type_manager) { + public function __construct( + EntityTypeManagerInterface $entity_type_manager + ) { parent::__construct(); $this->entityTypeManager = $entity_type_manager; } /** - * Clean HTML content by removing unwanted attributes and empty tags. + * Clean up HTML content in text paragraphs. + * + * @command cambridge:migrate-cleanup + * @aliases cm-cleanup + */ + public function migrateCleanup() { + try { + // Query all paragraphs of type "text". + $paragraph_storage = $this->entityTypeManager->getStorage('paragraph'); + $query = $paragraph_storage->getQuery() + ->condition('type', 'text') + ->accessCheck(FALSE); + $paragraph_ids = $query->execute(); + + if (empty($paragraph_ids)) { + $this->logger()->warning(dt('No text paragraphs found to clean up.')); + return; + } + + $this->logger()->notice(dt('Found @count text paragraphs to process.', [ + '@count' => count($paragraph_ids), + ])); + + $success_count = 0; + $error_count = 0; + + // Load paragraphs in chunks to avoid memory issues. + $chunk_size = 50; + $chunks = array_chunk($paragraph_ids, $chunk_size, TRUE); + + foreach ($chunks as $chunk) { + $paragraphs = $paragraph_storage->loadMultiple($chunk); + + foreach ($paragraphs as $paragraph_id => $paragraph) { + try { + // Ensure we're working with a Paragraph entity + if (!($paragraph instanceof Paragraph)) { + $paragraph = Paragraph::load($paragraph_id); + if (!$paragraph) { + $this->logger()->warning(dt('Could not load paragraph @id', [ + '@id' => $paragraph_id, + ])); + continue; + } + } + + // Skip if paragraph doesn't have a text field. + if (!$paragraph->hasField('field_text')) { + $this->logger()->warning(dt('Skipping paragraph @id - no text field found', [ + '@id' => $paragraph->id(), + ])); + continue; + } + + // Get the current HTML content. 
+ $text_field = $paragraph->get('field_text'); + $html = $text_field->value; + $format = $text_field->format; + + if (empty($html)) { + $this->logger()->notice(dt('Skipping paragraph @id - empty content', [ + '@id' => $paragraph->id(), + ])); + continue; + } + + // Process the HTML content. + $cleaned_html = $this->cleanupHtml($html); + + // Update the paragraph with the cleaned HTML. + $paragraph->set('field_text', [ + 'value' => $cleaned_html, + 'format' => $format, + ]); + $paragraph->save(); + + $success_count++; + $this->logger()->notice(dt('Successfully cleaned up paragraph @id', [ + '@id' => $paragraph->id(), + ])); + } + catch (\Exception $e) { + $error_count++; + $this->logger()->error(dt('Error cleaning up paragraph @id: @message', [ + '@id' => $paragraph->id(), + '@message' => $e->getMessage(), + ])); + } + } + + // Clear static caches to avoid memory issues. + $paragraph_storage->resetCache($chunk); + } + + $this->logger()->notice(dt('Cleanup complete. Successes: @success, Errors: @errors', [ + '@success' => $success_count, + '@errors' => $error_count, + ])); + } + catch (\Exception $e) { + $this->logger()->error(dt('Cleanup failed: @message', [ + '@message' => $e->getMessage(), + ])); + } + } + + /** + * Clean up HTML content. * * @param string $html - * The HTML content to clean. + * The HTML content to clean up. * * @return string - * The cleaned HTML content. + * The cleaned up HTML content. */ - protected function cleanHtml($html) { + protected function cleanupHtml($html) { + // If the HTML is empty, return it as is. if (empty($html)) { return $html; } - // Create a new DOM document + // Create a new DOMDocument. $doc = new DOMDocument(); - // Preserve whitespace to maintain formatting + // Preserve whitespace to avoid unwanted text nodes. 
$doc->preserveWhiteSpace = true; - $doc->formatOutput = true; - - // Suppress warnings during HTML loading - $internalErrors = libxml_use_internal_errors(true); - // Load HTML with UTF-8 encoding - $doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); + // Disable error reporting temporarily to suppress warnings about HTML5 tags. + $previous_value = libxml_use_internal_errors(true); - // Restore error handling - libxml_use_internal_errors($internalErrors); - - // Process the document if it loaded successfully - if ($doc->documentElement) { - // Remove unwanted attributes and empty tags recursively - $this->cleanNode($doc->documentElement); + // Load the HTML content. + // Add a wrapper to ensure proper parsing of fragments. + $doc->loadHTML('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>', LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); + + // Restore error reporting. + libxml_use_internal_errors($previous_value); - // Get the cleaned HTML - $cleanedHtml = $doc->saveHTML($doc->documentElement); + // Get the body element. + $body = $doc->getElementsByTagName('body')->item(0); - // Remove the XML declaration if present - $cleanedHtml = preg_replace('/<\?xml[^>]+\?>/', '', $cleanedHtml); + // Process the body element recursively. + $this->processNode($body); - // If the result is empty but the original wasn't, return the original - if (empty(trim($cleanedHtml)) && !empty(trim($html))) { - $this->logger()->warning('Cleaning resulted in empty content, keeping original'); - return $html; - } - - return trim($cleanedHtml); - } + // Save the processed HTML. + $processed_html = ''; + $children = $body->childNodes; - // If document couldn't be properly parsed, return original HTML - return $html; + foreach ($children as $child) { + $processed_html .= $doc->saveHTML($child); + } + + return $processed_html; } /** - * Recursively clean a DOM node by removing unwanted attributes and empty tags. 
+ * Process a DOM node recursively. * - * @param DOMElement $node - * The DOM node to clean. + * @param \DOMNode $node + * The DOM node to process. * * @return bool * TRUE if the node should be kept, FALSE if it should be removed. */ - protected function cleanNode(DOMElement $node) { - // Remove style and script elements entirely - $tagName = strtolower($node->tagName); - if ($tagName === 'style' || $tagName === 'script') { + protected function processNode(DOMNode $node) { + // If it's not an element node, keep it unless it's empty. + if ($node->nodeType !== XML_ELEMENT_NODE) { + // For text nodes, check if they contain only whitespace or non-breaking spaces. + if ($node->nodeType === XML_TEXT_NODE) { + $text = trim($node->textContent); + $text = str_replace(' ', '', $text); + $text = str_replace("\xC2\xA0", '', $text); // UTF-8 non-breaking space + return !empty($text); + } + return true; + } + + // If it's a script or style element, remove it. + if (in_array(strtolower($node->nodeName), ['script', 'style'])) { + $node->parentNode->removeChild($node); return false; } - // Remove class and style attributes - $node->removeAttribute('class'); - $node->removeAttribute('style'); + // If it's an element node, remove class and style attributes. + if ($node instanceof DOMElement) { + $node->removeAttribute('class'); + $node->removeAttribute('style'); + } - // Process child nodes + // Process child nodes recursively. 
$children = []; foreach ($node->childNodes as $child) { $children[] = $child; } - $hasNonEmptyContent = false; - + $keep_node = false; foreach ($children as $child) { - if ($child->nodeType === XML_ELEMENT_NODE) { - // Recursively clean child element - $keepChild = $this->cleanNode($child); - if (!$keepChild && ($child->nodeName === 'style' || $child->nodeName === 'script')) { - // Only remove style and script elements, keep other elements even if empty - $node->removeChild($child); - } - else { - // Always keep other elements, even if they're empty - $hasNonEmptyContent = true; - } + $keep_child = $this->processNode($child); + if (!$keep_child && $child->parentNode) { + $child->parentNode->removeChild($child); } - elseif ($child->nodeType === XML_TEXT_NODE) { - // Always keep text nodes, even if they're just whitespace - $hasNonEmptyContent = true; + else { + $keep_node = true; } } - // Always keep the node unless it's a style or script element - return true; - } - - /** - * Clean up HTML content in text paragraphs. 
- * - * @command cambridge:migrate-cleanup - * @aliases cm-cleanup - */ - public function migrateCleanup() { - try { - // Query all paragraphs of type "text" - $storage = $this->entityTypeManager->getStorage('paragraph'); - $query = $storage->getQuery() - ->condition('type', 'text') - ->accessCheck(FALSE); - $pids = $query->execute(); - - if (empty($pids)) { - $this->logger()->warning(dt('No text paragraphs found to clean up.')); - return; - } - - $success_count = 0; - $error_count = 0; - $unchanged_count = 0; - - foreach ($pids as $pid) { - try { - /** @var \Drupal\paragraphs\Entity\Paragraph $paragraph */ - $paragraph = $storage->load($pid); - - if (!$paragraph->hasField('field_text')) { - $this->logger()->warning(dt('Paragraph @pid does not have field_text field.', [ - '@pid' => $pid, - ])); - continue; - } - - $text_field = $paragraph->get('field_text'); - $value = $text_field->value; - $format = $text_field->format; - - // Skip if empty - if (empty($value)) { - $this->logger()->notice(dt('Skipping empty paragraph @pid', [ - '@pid' => $pid, - ])); - continue; - } - - // Clean the HTML content - $cleaned_value = $this->cleanHtml($value); - - // Update the paragraph if content changed - if ($cleaned_value !== $value) { - $text_field->setValue([ - 'value' => $cleaned_value, - 'format' => $format, - ]); - $paragraph->save(); - - $success_count++; - $this->logger()->notice(dt('Successfully cleaned HTML in paragraph @pid', [ - '@pid' => $pid, - ])); - } else { - $unchanged_count++; - $this->logger()->info(dt('No changes needed for paragraph @pid', [ - '@pid' => $pid, - ])); - } - } - catch (\Exception $e) { - $error_count++; - $this->logger()->error(dt('Error cleaning paragraph @pid: @message', [ - '@pid' => $pid, - '@message' => $e->getMessage(), - ])); - } + // If the node has no children and no text content, mark it for removal. + if (!$keep_node && $node->childNodes->length === 0) { + // Check if the node is not a self-closing tag like <img>, <br>, etc. 
+ $self_closing_tags = ['img', 'br', 'hr', 'input', 'meta', 'link']; + if (!in_array(strtolower($node->nodeName), $self_closing_tags)) { + return false; } - - $this->logger()->notice(dt('HTML cleanup complete. Successes: @success, Unchanged: @unchanged, Errors: @errors', [ - '@success' => $success_count, - '@unchanged' => $unchanged_count, - '@errors' => $error_count, - ])); - } - catch (\Exception $e) { - $this->logger()->error(dt('HTML cleanup failed: @message', [ - '@message' => $e->getMessage(), - ])); } + + return true; } /** @@ -231,7 +268,7 @@ class MigrateCleanupCommand extends DrushCommands { */ public static function create(ContainerInterface $container) { return new static( - $container->get('entity_type_manager') + $container->get('entity_type.manager') ); } -- GitLab From c82c87cf12aa673f2828d06e90aa8aa6d07e0e6e Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Fri, 21 Mar 2025 18:42:35 +0000 Subject: [PATCH 11/12] Exclude embedded Drupal media tags 'drupal-entity' and 'drupal-media'; --- src/Drush/Commands/MigrateCleanupCommand.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php index db6a5a0..e054cd8 100644 --- a/src/Drush/Commands/MigrateCleanupCommand.php +++ b/src/Drush/Commands/MigrateCleanupCommand.php @@ -222,6 +222,12 @@ class MigrateCleanupCommand extends DrushCommands { return true; } + // Always preserve Drupal-specific tags + $drupal_tags = ['drupal-entity', 'drupal-media']; + if (in_array(strtolower($node->nodeName), $drupal_tags)) { + return true; + } + // If it's a script or style element, remove it. 
if (in_array(strtolower($node->nodeName), ['script', 'style'])) { $node->parentNode->removeChild($node); -- GitLab From 2d0eff48057dd293decd60230824dbd6c65033f8 Mon Sep 17 00:00:00 2001 From: Anthony Michaels <a.michaels@webpackager.com> Date: Fri, 21 Mar 2025 19:17:39 +0000 Subject: [PATCH 12/12] Focus on specific empty tags thus avoiding Drupal reserved tags being affected; --- src/Drush/Commands/MigrateCleanupCommand.php | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/Drush/Commands/MigrateCleanupCommand.php b/src/Drush/Commands/MigrateCleanupCommand.php index e054cd8..d6cd921 100644 --- a/src/Drush/Commands/MigrateCleanupCommand.php +++ b/src/Drush/Commands/MigrateCleanupCommand.php @@ -221,12 +221,6 @@ class MigrateCleanupCommand extends DrushCommands { } return true; } - - // Always preserve Drupal-specific tags - $drupal_tags = ['drupal-entity', 'drupal-media']; - if (in_array(strtolower($node->nodeName), $drupal_tags)) { - return true; - } // If it's a script or style element, remove it. if (in_array(strtolower($node->nodeName), ['script', 'style'])) { @@ -257,11 +251,13 @@ class MigrateCleanupCommand extends DrushCommands { } } - // If the node has no children and no text content, mark it for removal. + // Only check specific tags for emptiness + $tags_to_check = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'b', 'em', 'i', + 'small', 'mark', 'del', 'ins', 'sub', 'sup', 'q', 'cite', 'pre']; + + // If the node has no children and no text content, check if it's in our list of tags to check if (!$keep_node && $node->childNodes->length === 0) { - // Check if the node is not a self-closing tag like <img>, <br>, etc. - $self_closing_tags = ['img', 'br', 'hr', 'input', 'meta', 'link']; - if (!in_array(strtolower($node->nodeName), $self_closing_tags)) { + if (in_array(strtolower($node->nodeName), $tags_to_check)) { return false; } } -- GitLab