FAQ
| This is a
LIVE
service |
Changelog
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
Cambridge migrations
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Information Services
DevOps
Drupal team
2023 Platform
Custom Modules
Cambridge migrations
Merge requests
!8
Feature/am 427 sidebar only basic html
Code
Review changes
Check out branch
Download
Patches
Plain diff
Merged
Feature/am 427 sidebar only basic html
feature/am-427-sidebar-only-basic-html
into
main
Overview
0
Commits
12
Pipelines
0
Changes
4
Merged
A. Michaels
requested to merge
feature/am-427-sidebar-only-basic-html
into
main
1 week ago
Overview
0
Commits
12
Pipelines
0
Changes
4
Expand
Closes feature/am-427-sidebar-only-basic-html
0
0
Merge request reports
Compare
main
version 6
c82c87cf
1 week ago
version 5
8bab4800
1 week ago
version 4
95a0c966
1 week ago
version 3
d57b4ff5
1 week ago
version 2
dc73033f
1 week ago
version 1
091c2d55
1 week ago
main (base)
and
latest version
latest version
2d0eff48
12 commits,
1 week ago
version 6
c82c87cf
11 commits,
1 week ago
version 5
8bab4800
10 commits,
1 week ago
version 4
95a0c966
9 commits,
1 week ago
version 3
d57b4ff5
8 commits,
1 week ago
version 2
dc73033f
7 commits,
1 week ago
version 1
091c2d55
6 commits,
1 week ago
4 files
+
641
−
64
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
Files
4
Search (e.g. *.vue) (Ctrl+P)
src/Drush/Commands/MigrateCleanupCommand.php
0 → 100644
+
277
−
0
Options
<?php
namespace
Drupal\cambridge_migrations\Drush\Commands
;
use
Drush\Commands\DrushCommands
;
use
Drupal\Core\Entity\EntityTypeManagerInterface
;
use
Drupal\paragraphs\Entity\Paragraph
;
use
Symfony\Component\DependencyInjection\ContainerInterface
;
use
DOMDocument
;
use
DOMElement
;
use
DOMNode
;
/**
* Drush command for cleaning up HTML content in text paragraphs.
*
* This command:
* - Queries all paragraphs of type "text"
* - For each paragraph, processes its HTML content to:
* - Remove all class and style attributes from every element
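* - Recursively remove empty text-level tags (p, h1-h6, strong, b, em, i, small, mark, del, ins, sub, sup, q, cite, pre), including those holding only whitespace or non-breaking spaces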
* - Remove all <style> and <script> elements entirely
*
* To run:
* ddev drush cambridge:migrate-cleanup
*/
class
MigrateCleanupCommand
extends
DrushCommands
{
/**
* The entity type manager.
*
* @var \Drupal\Core\Entity\EntityTypeManagerInterface
*/
protected
$entityTypeManager
;
/**
 * Builds the command around the entity type manager service.
 *
 * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
 *   Entity type manager; kept on the instance so the command can obtain the
 *   paragraph storage when it runs.
 */
public function __construct(EntityTypeManagerInterface $entity_type_manager) {
  // Let the Drush base class initialise itself before storing our own state.
  parent::__construct();

  $this->entityTypeManager = $entity_type_manager;
}
/**
 * Clean up HTML content in text paragraphs.
 *
 * Queries the IDs of all paragraphs of type "text" (access checks disabled),
 * loads them in chunks of 50, passes the "field_text" value through
 * cleanupHtml() and saves the paragraph when the markup actually changed.
 * The text format of the field is written back unchanged.
 *
 * Paragraphs without a "field_text" field, with empty content, or whose
 * markup is already clean are skipped. A failure on one paragraph is logged
 * and counted, and processing continues with the next one. All reporting
 * goes through the Drush logger; nothing is returned.
 *
 * @command cambridge:migrate-cleanup
 * @aliases cm-cleanup
 */
public function migrateCleanup() {
  try {
    // Query all paragraphs of type "text".
    $paragraph_storage = $this->entityTypeManager->getStorage('paragraph');
    $paragraph_ids = $paragraph_storage->getQuery()
      ->condition('type', 'text')
      ->accessCheck(FALSE)
      ->execute();

    if (empty($paragraph_ids)) {
      $this->logger()->warning(dt('No text paragraphs found to clean up.'));
      return;
    }

    $this->logger()->notice(dt('Found @count text paragraphs to process.', [
      '@count' => count($paragraph_ids),
    ]));

    $success_count = 0;
    $error_count = 0;

    // Load paragraphs in chunks to avoid memory issues.
    $chunk_size = 50;
    foreach (array_chunk($paragraph_ids, $chunk_size, TRUE) as $chunk) {
      foreach ($paragraph_storage->loadMultiple($chunk) as $paragraph_id => $paragraph) {
        try {
          // Ensure we're working with a Paragraph entity; fall back to a
          // direct load when the storage handed back something else.
          if (!($paragraph instanceof Paragraph)) {
            $paragraph = Paragraph::load($paragraph_id);
            if (!$paragraph) {
              $this->logger()->warning(dt('Could not load paragraph @id', [
                '@id' => $paragraph_id,
              ]));
              continue;
            }
          }

          // Skip if paragraph doesn't have a text field.
          if (!$paragraph->hasField('field_text')) {
            $this->logger()->warning(dt('Skipping paragraph @id - no text field found', [
              '@id' => $paragraph->id(),
            ]));
            continue;
          }

          // Get the current HTML content and its text format.
          $text_field = $paragraph->get('field_text');
          $html = $text_field->value;
          $format = $text_field->format;

          if (empty($html)) {
            $this->logger()->notice(dt('Skipping paragraph @id - empty content', [
              '@id' => $paragraph->id(),
            ]));
            continue;
          }

          // Process the HTML content.
          $cleaned_html = $this->cleanupHtml($html);

          // Nothing changed: avoid a pointless entity save (and the storage
          // writes that come with it), which also keeps re-runs cheap.
          if ($cleaned_html === $html) {
            $this->logger()->notice(dt('Skipping paragraph @id - already clean', [
              '@id' => $paragraph->id(),
            ]));
            continue;
          }

          // Update the paragraph with the cleaned HTML, keeping its format.
          $paragraph->set('field_text', [
            'value' => $cleaned_html,
            'format' => $format,
          ]);
          $paragraph->save();
          $success_count++;

          $this->logger()->notice(dt('Successfully cleaned up paragraph @id', [
            '@id' => $paragraph->id(),
          ]));
        }
        catch (\Throwable $e) {
          // \Throwable rather than \Exception: a \TypeError or other \Error
          // raised while handling one paragraph must not abort the whole
          // batch. The loop key is used for the ID so the log call itself
          // cannot fail on an unusable $paragraph.
          $error_count++;
          $this->logger()->error(dt('Error cleaning up paragraph @id: @message', [
            '@id' => $paragraph_id,
            '@message' => $e->getMessage(),
          ]));
        }
      }

      // Clear static caches to avoid memory issues.
      $paragraph_storage->resetCache($chunk);
    }

    $this->logger()->notice(dt('Cleanup complete. Successes: @success, Errors: @errors', [
      '@success' => $success_count,
      '@errors' => $error_count,
    ]));
  }
  catch (\Exception $e) {
    // Failures outside the per-paragraph handling (storage lookup, query).
    $this->logger()->error(dt('Cleanup failed: @message', [
      '@message' => $e->getMessage(),
    ]));
  }
}
/**
 * Clean up HTML content.
 *
 * The fragment is wrapped in a minimal UTF-8 HTML document, parsed with
 * DOMDocument, cleaned in place by processNode() starting at the <body>
 * element, and the children left under <body> are serialised back into a
 * string.
 *
 * @param string $html
 *   The HTML content to clean up.
 *
 * @return string
 *   The cleaned up HTML content. Input that is empty, or that cannot be
 *   parsed into a document containing a <body>, is returned unchanged.
 */
protected function cleanupHtml($html) {
  // If the HTML is empty, return it as is.
  if (empty($html)) {
    return $html;
  }

  $doc = new DOMDocument();
  // Keep whitespace-only text nodes exactly as they were parsed; deciding
  // what to do with them is left to processNode().
  $doc->preserveWhiteSpace = TRUE;

  // Collect libxml warnings (e.g. about HTML5 tags) internally instead of
  // emitting them as PHP warnings.
  $previous_value = libxml_use_internal_errors(TRUE);
  try {
    // Wrap the fragment so it is parsed as body content of a UTF-8 document.
    $loaded = $doc->loadHTML(
      '<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>',
      LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
    );
  }
  finally {
    // Drop the collected errors so the buffer does not grow with every call,
    // then restore the caller's error handling mode even if parsing threw.
    libxml_clear_errors();
    libxml_use_internal_errors($previous_value);
  }

  // Without a parsed <body> there is nothing to process; hand the input back
  // rather than passing NULL to processNode().
  $body = $loaded ? $doc->getElementsByTagName('body')->item(0) : NULL;
  if ($body === NULL) {
    return $html;
  }

  // Process the body element recursively.
  $this->processNode($body);

  // Serialise only the children of <body>, i.e. the cleaned fragment.
  $processed_html = '';
  foreach ($body->childNodes as $child) {
    $processed_html .= $doc->saveHTML($child);
  }

  return $processed_html;
}
/**
 * Process a DOM node recursively.
 *
 * For element nodes this:
 * - detaches <script> and <style> elements,
 * - removes the "class" and "style" attributes,
 * - processes all children and removes those reported as not to be kept,
 * - reports the element itself as removable when it is one of a fixed list
 *   of text-level tags and is left without meaningful content.
 *
 * Text nodes made up only of whitespace / non-breaking spaces never count as
 * content, but they are left in place while their parent is kept, so the
 * spacing between neighbouring inline elements is not lost.
 *
 * @param \DOMNode $node
 *   The DOM node to process.
 *
 * @return bool
 *   TRUE if the node should be kept, FALSE if it should be removed. A
 *   <script>/<style> element has already been detached from its parent when
 *   FALSE is returned for it.
 */
protected function processNode(DOMNode $node) {
  // Non-element nodes: only blank text nodes are removable; comments and any
  // other node types are always kept.
  if ($node->nodeType !== XML_ELEMENT_NODE) {
    return $node->nodeType !== XML_TEXT_NODE || !$this->isBlankTextNode($node);
  }

  $tag = strtolower($node->nodeName);

  // Script and style elements are dropped together with their content.
  if (in_array($tag, ['script', 'style'], TRUE)) {
    if ($node->parentNode) {
      $node->parentNode->removeChild($node);
    }
    return FALSE;
  }

  // Strip presentational attributes.
  if ($node instanceof DOMElement) {
    $node->removeAttribute('class');
    $node->removeAttribute('style');
  }

  // Work on a snapshot: childNodes is a live list and children get removed
  // while iterating.
  $children = iterator_to_array($node->childNodes, FALSE);

  $has_content = FALSE;
  foreach ($children as $child) {
    // Blank text is not content, but it may separate two inline siblings,
    // so leave it where it is. It disappears with the parent if the parent
    // turns out to be empty.
    if ($child->nodeType === XML_TEXT_NODE && $this->isBlankTextNode($child)) {
      continue;
    }

    if ($this->processNode($child)) {
      $has_content = TRUE;
    }
    elseif ($child->parentNode) {
      // Children that detached themselves (script/style) have no parent any
      // more; they must neither be removed again nor count as content.
      $child->parentNode->removeChild($child);
    }
  }

  // Only check specific tags for emptiness.
  $tags_to_check = [
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'strong', 'b', 'em', 'i', 'small', 'mark', 'del', 'ins',
    'sub', 'sup', 'q', 'cite', 'pre',
  ];

  // An element from the list that ended up without any meaningful child is
  // reported as removable; every other element is kept.
  if (!$has_content && in_array($tag, $tags_to_check, TRUE)) {
    return FALSE;
  }

  return TRUE;
}

/**
 * Tells whether a text node contains nothing but blank characters.
 *
 * Spaces and UTF-8 non-breaking spaces are stripped first and the rest is
 * trimmed, so any mix of whitespace and non-breaking spaces counts as blank.
 * The result is compared against the empty string explicitly, because
 * empty() would also treat the text "0" as blank.
 *
 * @param \DOMNode $node
 *   The text node to inspect.
 *
 * @return bool
 *   TRUE if the node's text is blank, FALSE otherwise.
 */
protected function isBlankTextNode(DOMNode $node) {
  $text = str_replace([' ', "\xC2\xA0"], '', $node->textContent);

  return trim($text) === '';
}
/**
 * Creates the command with its dependency taken from the container.
 *
 * @param \Symfony\Component\DependencyInjection\ContainerInterface $container
 *   The container from which the 'entity_type.manager' service is read.
 *
 * @return static
 *   A new command instance.
 */
public static function create(ContainerInterface $container) {
  $entity_type_manager = $container->get('entity_type.manager');

  return new static($entity_type_manager);
}
}
Loading