Skip to content

Instantly share code, notes, and snippets.

@dave-mills
Last active June 26, 2025 15:17
Show Gist options
  • Select an option

  • Save dave-mills/d73d254d7a1facc1b51ea6375520c34b to your computer and use it in GitHub Desktop.

Select an option

Save dave-mills/d73d254d7a1facc1b51ea6375520c34b to your computer and use it in GitHub Desktop.

Revisions

  1. dave-mills revised this gist Jun 26, 2025. No changes.
  2. dave-mills created this gist Jun 26, 2025.
    120 changes: 120 additions & 0 deletions FormatArticles.php
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,120 @@
    // One-off clean-up script: rebuilds every article's `content` from its
    // `old_content` by stripping inline formatting (styles, classes, <font>,
    // <span>, Word <o:p> residue) and normalising line breaks around block tags.
    //
    // `old_content` is only read, never written, so the script can be re-run and
    // the untouched markup stays available (copy `old_content` back to `content`
    // to undo a run).

    // Ordered [pattern, replacement] pairs, applied top to bottom. Order matters:
    // later steps rely on the whitespace/newline state left by earlier ones.
    $cleanupSteps = [
        // Remove inline style attributes, EXCEPT those mentioning "width:"
        // (used for image sizing). The negative lookahead keeps such attributes
        // untouched in a single pass. NOTE: the whole attribute is kept, not just
        // the width declaration.
        ['/style="(?![^"]*width:)[^"]*"/', ''],

        // Remove inline class attributes.
        ['/class="[^"]*"/', ''],

        // Remove opening <font> tags, with or without attributes.
        ['/<font(\s[^>]*)?>/', ''],

        // Remove stray whitespace left before the closing ">" of a tag
        // (e.g. "<p >" after its attributes were stripped).
        ['/\s+>/', '>'],

        // Remove &nbsp; entities, Word's <o:p> tags, closing </font> tags and all
        // <span> tags (spans are only used for inline formatting here).
        // NOTE(review): "&nbsp;" is deleted outright, which can join two words
        // that were separated only by it — confirm this is intended.
        [
            [
                '/&nbsp;/',
                '/<o:p>/',
                '/<\/o:p>/',
                '/<\/font>/',
                '/<span[^>]*>/',
                '/<\/span>/',
            ],
            '',
        ],

        // Flatten existing newlines (line breaks are all over the place in many
        // of the articles); deliberate ones are re-inserted below.
        ['/[\n\r]+/', ' '],

        // Put exactly one newline after each closing paragraph tag...
        ['/<\/p>\s*</', '</p>' . "\n" . '<'],

        // ...after each closing heading tag...
        ['/<\/h(\d)>\s*</', '</h${1}>' . "\n" . '<'],

        // ...and after each line-break tag (<br>, <br/> or <br />). The tag is
        // re-emitted exactly as it was written.
        ['/(<br\s*\/?>)\s*</', '${1}' . "\n" . '<'],

        // Remove empty paragraphs that start a line.
        ['/[\r\n]<p>\s*<\/p>/', ''],

        // Remove whitespace directly before a closing </p>.
        ['/\s+<\/p>/', '</p>'],
    ];

    // lazyById() streams the table in primary-key chunks instead of hydrating
    // every article at once.
    Article::query()
        ->lazyById()
        ->each(static function (Article $article) use ($cleanupSteps): void {
            $html = $article->old_content;

            // Nothing to rebuild from: leave the current content alone rather
            // than overwriting it with an empty string.
            if (! is_string($html) || $html === '') {
                return;
            }

            foreach ($cleanupSteps as [$pattern, $replacement]) {
                $html = Str::replaceMatches($pattern, $replacement, $html);

                // A null result signals a PCRE failure; stop instead of saving
                // truncated or empty content.
                if (! is_string($html)) {
                    throw new \RuntimeException(sprintf(
                        'Regex clean-up failed for article %s: %s',
                        (string) $article->getKey(),
                        preg_last_error_msg(),
                    ));
                }
            }

            $article->content = $html;
            $article->save();
        });