feat: more robust feed-text cleanup - newsplanet - Planet-Style Newsfeed generated with perlanet

commit 0f261c6309a35d3e0f70473afab41adb52055c49
parent 865396e85df919e9f7d73caa23c918a68a7614d9
Author: JayVii <jayvii[AT]posteo[DOT]de>
Date:   Sat, 12 Apr 2025 11:42:59 +0200

feat: more robust feed-text cleanup

Diffstat:
M scripts/run.sh  | 36 +++++++++++++++++++++++++-----------
M templates/index.tt  | 19 +++----------------

2 files changed, 28 insertions(+), 27 deletions(-)
diff --git a/scripts/run.sh b/scripts/run.sh
@@ -19,20 +19,34 @@ eval "$(parse_yaml ${config})"
 printf "Fetching feeds and generating html...\n"
 perlanet "$config"
 
-# remove images and iframes from article previews
-printf "Removing images and iframes...\n"
-sed -E -e 's/<img[^>]*>//g' -e 's/<iframe[^>]*>//g' \
-  -e 's/<figure[^>]*>//g' -e 's/<\/figure>//g' -i "$page_file"
-
-# remove linebreaks and empty paragraphs as well as inline-links ("more...")
+# Clean up HTML from tags within the <!--start--> / <!--end--> comments
+# 1. removes anchor text at the end of the post (typically "read more...")
+# 2. removes text within <figcaption></figcaption>
+# 3. removes all HTML tags
+# 4. creates paragraphs line-wise
+# 5. removes empty paragraphs
 printf "Clean up HTML...\n"
+start_lines=($(grep -n "<\!--start-->" "$page_file" | sed -e 's/:.*//g'))
+stop_lines=($(grep -n "<\!--end-->" "$page_file" | sed -e 's/:.*//g'))
+for i in $(seq 0 1 $((${#start_lines[@]} - 1))); do
+  for line in $(seq ${start_lines[$i]} 1 ${stop_lines[$i]}); do
+    sed -E \
+      -e "${line}s/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g" \
+      -e "${line}s/<figcaption.*<\/figcaption>//g" \
+      -e "${line}s/<[^<]+>//g" \
+      -e "${line}s/^/<p>/" \
+      -e "${line}s/$/<\/p>/" \
+      -e "${line}s/<p>\s*\t*(<\!--(start|end)-->)*<\/p>/\1/g" \
+      -i "$page_file"
+  done
+done
+
+# re-apply <!--start--> / <!--end--> comments
 sed -E \
-  -e 's/<br[^a-z]*[^>]*>//g' \
-  -e 's/<p[^a-z]*[^>]*><\/p>//g' \
-  -e 's/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g' \
-  -e 's/(<\!--start-->)(([^<]*)(<(a|div)[^>]*>)*([^<]*)(<\/(a|div)>)([^<]*))+.*/\1\3\6\9<\!--end-->/g' \
+  -e "${start_lines[0]}s/^/<\!--start\-\->/g" \
+  -e "${stop_lines[0]}s/$/<\!--end-->/g" \
   -i "$page_file"
-
+  
 # insert link to rss/xml file
 printf "Inserting RSS feed file...\n"
 sed -E -e "s/<\!--XML-->/$feed_file/g" -i "$page_file"
diff --git a/templates/index.tt b/templates/index.tt
@@ -27,7 +27,7 @@
     <meta name="robots" content="noindex">
   </head>
 
-  <body onload="check_articles();">
+  <body>
 
     <header>
       <nav>
@@ -57,19 +57,6 @@
           <!--UPDATED-->
         </p>
 
-        <!-- This is updated by find-and-replace-->
-        <!-- <details> -->
-          <!-- <summary style="width:100%;">Feeds</summary> -->
-            <!-- <div class="button-row"> -->
-              <!-- <a href=<!--XML--> class="button" title="Subscribe to all these feeds via your own RSS reader">Subscribe via RSS</a> -->
-              <!-- <a href=<!--OPML--> class="button" title="Import all these feeds into your own RSS reader">Download OPML</a> -->
-            <!-- </div> -->
-
-            <div style="margin-left:1em;">
-            <!--FEEDS-->
-            </div>
-        </details>
-
         <!-- Sub Feed Entries -->
         <!--SUBFEED-->
 
@@ -96,9 +83,9 @@
           <!-- Content -->
           <div class="article_content">
             [% IF entry.summary.body %]
-              <p><!--start-->[% entry.summary.body  %]<!--end--></p>
+              <!--start-->[% entry.summary.body  %]<!--end-->
             [% ELSE %]
-                <!--start-->[% entry.content.body %]<!--end-->
+              <!--start-->[% entry.content.body %]<!--end-->
             [% END %]
           </div>
         </section>

	pub / newsplanet Planet-Style Newsfeed generated with perlanet
	`git clone https://src.jayvii.de/pub/newsplanet.git`
	Home | Log | Files | Exports | Refs | README | RSS

M	scripts/run.sh	|	36	+++++++++++++++++++++++++-----------
M	templates/index.tt	|	19	+++----------------

pub / newsplanet