pub / newsplanet

Planet-Style Newsfeed generated with perlanet
git clone https://src.jayvii.de/pub/newsplanet.git
Home | Log | Files | Exports | Refs | README | RSS

commit 0f261c6309a35d3e0f70473afab41adb52055c49
parent 865396e85df919e9f7d73caa23c918a68a7614d9
Author: JayVii <jayvii[AT]posteo[DOT]de>
Date:   Sat, 12 Apr 2025 11:42:59 +0200

feat: more robust feed-text cleanup

Diffstat:
M scripts/run.sh     | 36 +++++++++++++++++++++++++-----------
M templates/index.tt | 19 +++----------------
2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/scripts/run.sh b/scripts/run.sh
@@ -19,20 +19,34 @@ eval "$(parse_yaml ${config})"
 printf "Fetching feeds and generating html...\n"
 perlanet "$config"
-# remove images and iframes from article previews
-printf "Removing images and iframes...\n"
-sed -E -e 's/<img[^>]*>//g' -e 's/<iframe[^>]*>//g' \
- -e 's/<figure[^>]*>//g' -e 's/<\/figure>//g' -i "$page_file"
-
-# remove linebreaks and empty paragraphs as well as inline-links ("more...")
+# Clean up HTML from tags within the <!--start--> / <!--end--> comments
+# 1. removes anchor text at the end of the post (typically "read more...")
+# 2. removes text within <figcaption></figcaption>
+# 3. removes all HTML tags
+# 4. creates paragraphs line-wise
+# 5. removes empty paragraphs
 printf "Clean up HTML...\n"
+start_lines=($(grep -n "<\!--start-->" "$page_file" | sed -e 's/:.*//g'))
+stop_lines=($(grep -n "<\!--end-->" "$page_file" | sed -e 's/:.*//g'))
+for i in $(seq 0 1 $((${#start_lines[@]} - 1))); do
+ for line in $(seq ${start_lines[$i]} 1 ${stop_lines[$i]}); do
+ sed -E \
+ -e "${line}s/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g" \
+ -e "${line}s/<figcaption.*<\/figcaption>//g" \
+ -e "${line}s/<[^<]+>//g" \
+ -e "${line}s/^/<p>/" \
+ -e "${line}s/$/<\/p>/" \
+ -e "${line}s/<p>\s*\t*(<\!--(start|end)-->)*<\/p>/\1/g" \
+ -i "$page_file"
+ done
+done
+
+# re-apply <!--start--> / <!--end--> comments
 sed -E \
- -e 's/<br[^a-z]*[^>]*>//g' \
- -e 's/<p[^a-z]*[^>]*><\/p>//g' \
- -e 's/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g' \
- -e 's/(<\!--start-->)(([^<]*)(<(a|div)[^>]*>)*([^<]*)(<\/(a|div)>)([^<]*))+.*/\1\3\6\9<\!--end-->/g' \
+ -e "${start_lines[0]}s/^/<\!--start\-\->/g" \
+ -e "${stop_lines[0]}s/$/<\!--end-->/g" \
 -i "$page_file"
-
+
 # insert link to rss/xml file
 printf "Inserting RSS feed file...\n"
 sed -E -e "s/<\!--XML-->/$feed_file/g" -i "$page_file"
diff --git a/templates/index.tt b/templates/index.tt
@@ -27,7 +27,7 @@
 <meta name="robots" content="noindex">
 </head>
- <body onload="check_articles();">
+ <body>
 <header>
 <nav>
@@ -57,19 +57,6 @@
 <!--UPDATED-->
 </p>
- <!-- This is updated by find-and-replace-->
- <!-- <details> -->
- <!-- <summary style="width:100%;">Feeds</summary> -->
- <!-- <div class="button-row"> -->
- <!-- <a href=<!--XML--> class="button" title="Subscribe to all these feeds via your own RSS reader">Subscribe via RSS</a> -->
- <!-- <a href=<!--OPML--> class="button" title="Import all these feeds into your own RSS reader">Download OPML</a> -->
- <!-- </div> -->
-
- <div style="margin-left:1em;">
- <!--FEEDS-->
- </div>
- </details>
-
 <!-- Sub Feed Entries -->
 <!--SUBFEED-->
@@ -96,9 +83,9 @@
 <!-- Content -->
 <div class="article_content">
 [% IF entry.summary.body %]
- <p><!--start-->[% entry.summary.body %]<!--end--></p>
+ <!--start-->[% entry.summary.body %]<!--end-->
 [% ELSE %]
- <!--start-->[% entry.content.body %]<!--end-->
+ <!--start-->[% entry.content.body %]<!--end-->
 [% END %]
 </div>
 </section>