commit 0f261c6309a35d3e0f70473afab41adb52055c49
parent 865396e85df919e9f7d73caa23c918a68a7614d9
Author: JayVii <jayvii[AT]posteo[DOT]de>
Date: Sat, 12 Apr 2025 11:42:59 +0200
feat: more robust feed-text cleanup
Diffstat:
2 files changed, 28 insertions(+), 27 deletions(-)
diff --git a/scripts/run.sh b/scripts/run.sh
@@ -19,20 +19,34 @@ eval "$(parse_yaml ${config})"
printf "Fetching feeds and generating html...\n"
perlanet "$config"
-# remove images and iframes from article previews
-printf "Removing images and iframes...\n"
-sed -E -e 's/<img[^>]*>//g' -e 's/<iframe[^>]*>//g' \
- -e 's/<figure[^>]*>//g' -e 's/<\/figure>//g' -i "$page_file"
-
-# remove linebreaks and empty paragraphs as well as inline-links ("more...")
+# Clean up the HTML on every line from <!--start--> through <!--end-->:
+# 1. removes a trailing link right before <!--end--> (typically "read more...")
+# 2. removes <figcaption> elements together with their text
+# 3. removes all remaining HTML tags
+# 4. wraps each line in <p></p>
+# 5. removes empty paragraphs
printf "Clean up HTML...\n"
+start_lines=($(grep -n "<\!--start-->" "$page_file" | sed -e 's/:.*//g'))
+stop_lines=($(grep -n "<\!--end-->" "$page_file" | sed -e 's/:.*//g'))
+for i in $(seq 0 1 $((${#start_lines[@]} - 1))); do
+ for line in $(seq ${start_lines[$i]} 1 ${stop_lines[$i]}); do
+ sed -E \
+ -e "${line}s/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g" \
+ -e "${line}s/<figcaption.*<\/figcaption>//g" \
+ -e "${line}s/<[^<]+>//g" \
+ -e "${line}s/^/<p>/" \
+ -e "${line}s/$/<\/p>/" \
+ -e "${line}s/<p>\s*\t*(<\!--(start|end)-->)*<\/p>/\1/g" \
+ -i "$page_file"
+ done
+done
+
+# re-apply the <!--start--> / <!--end--> comments (first entry only)
sed -E \
- -e 's/<br[^a-z]*[^>]*>//g' \
- -e 's/<p[^a-z]*[^>]*><\/p>//g' \
- -e 's/<a\ [^<]+<\/a><\!--end-->/<\!--end-->/g' \
- -e 's/(<\!--start-->)(([^<]*)(<(a|div)[^>]*>)*([^<]*)(<\/(a|div)>)([^<]*))+.*/\1\3\6\9<\!--end-->/g' \
+ -e "${start_lines[0]}s/^/<\!--start\-\->/g" \
+ -e "${stop_lines[0]}s/$/<\!--end-->/g" \
-i "$page_file"
-
+
# insert link to rss/xml file
printf "Inserting RSS feed file...\n"
sed -E -e "s/<\!--XML-->/$feed_file/g" -i "$page_file"
diff --git a/templates/index.tt b/templates/index.tt
@@ -27,7 +27,7 @@
<meta name="robots" content="noindex">
</head>
- <body onload="check_articles();">
+ <body>
<header>
<nav>
@@ -57,19 +57,6 @@
<!--UPDATED-->
</p>
- <!-- This is updated by find-and-replace-->
- <!-- <details> -->
- <!-- <summary style="width:100%;">Feeds</summary> -->
- <!-- <div class="button-row"> -->
- <!-- <a href=<!--XML--> class="button" title="Subscribe to all these feeds via your own RSS reader">Subscribe via RSS</a> -->
- <!-- <a href=<!--OPML--> class="button" title="Import all these feeds into your own RSS reader">Download OPML</a> -->
- <!-- </div> -->
-
- <div style="margin-left:1em;">
- <!--FEEDS-->
- </div>
- </details>
-
<!-- Sub Feed Entries -->
<!--SUBFEED-->
@@ -96,9 +83,9 @@
<!-- Content -->
<div class="article_content">
[% IF entry.summary.body %]
- <p><!--start-->[% entry.summary.body %]<!--end--></p>
+ <!--start-->[% entry.summary.body %]<!--end-->
[% ELSE %]
- <!--start-->[% entry.content.body %]<!--end-->
+ <!--start-->[% entry.content.body %]<!--end-->
[% END %]
</div>
</section>