commit ccc1b19aaad3e3eb1c7967e6357b13beb0e426a9
parent dc64999eb312914a47bbbd95ebcc87ed170609e2
Author: JayVii <jayvii[AT]posteo[DOT]de>
Date: Wed, 19 Mar 2025 20:15:57 +0100
feat: rewrite dates for certain feeds
Diffstat:
3 files changed, 122 insertions(+), 2 deletions(-)
diff --git a/01_pulse.yaml b/01_pulse.yaml
@@ -53,11 +53,14 @@ feeds:
url: https://www.nd-aktuell.de/rss/wirtschaft-umwelt.xml
web: https://www.nd-aktuell.de/rubrik/wirtschaft/
- title: junge Welt (Kapital und Arbeit)
- url: https://www.jungewelt.de/feeds/kapital_arbeit.xml
+ url: file:./rewrite/www.jungewelt.de_feeds_kapital_arbeit.xml
web: https://www.jungewelt.de/aktuell/rubrik/kapital_und_arbeit.php
- title: junge Welt (Inland)
- url: https://www.jungewelt.de/feeds/inland.xml
+ url: file:./rewrite/www.jungewelt.de_feeds_inland.xml
web: https://www.jungewelt.de/aktuell/rubrik/inland.php
+ - title: junge Welt (Ausland)
+ url: file:./rewrite/www.jungewelt.de_feeds_ausland.xml
+ web: https://www.jungewelt.de/aktuell/rubrik/ausland.php
- title: Taz (Politik)
url: https://taz.de/Politik/!p4615;rss/
web: https://taz.de/Politik/!p4615/
diff --git a/fetch_and_rewrite.sh b/fetch_and_rewrite.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+
+# Config -----------------------------------------------------------------------
+
+# cache directory
+# Parsed snapshots of the feed are stored here and compared between runs.
+cache_dir="/tmp/cache/newsplanet/rewrite"
+# File name derived from the feed URL passed as $1: the first "http://" or
+# "https://" is removed and every "/" is replaced by "_".
+# printf is used instead of echo so that an argument such as "-n" is passed
+# through as data and never taken as an option.
+file_name=$(printf '%s\n' "$1" | sed -e 's/https*:\/\///' -e 's/\//_/g')
+# Raw download target; ".new" and ".old" suffixes are appended to this path
+# for the parsed snapshots.
+cache_file="${cache_dir}/${file_name}"
+
+# Function ---------------------------------------------------------------------
+
+# Read one XML token from stdin.
+# Consumes input up to the next "<" and splits it at the first ">": the part
+# in front of it (tag name plus attributes) is stored in the global ENTITY,
+# the remainder (the text that follows the tag) in the global CONTENT.
+# Returns non-zero once no further "<" is found, which ends a
+# `while read_xml` loop.
+read_xml () {
+  # IFS is changed for this call only; -r keeps backslashes in the feed text
+  # from being swallowed by read.
+  local IFS='>'
+  read -r -d '<' ENTITY CONTENT
+}
+
+# Convert a parsed feed file back into XML.
+# Arguments: $1 - parsed input file, one "tag" or "tag => text" per line
+#            $2 - output file (overwritten)
+write_xml () {
+  # 1. lines starting with "/" are closing tags
+  # 2. "tag => text" becomes "<tag>text"; a tag can never contain ">", so the
+  #    split is made at the FIRST "=>" and text that itself contains "=>"
+  #    stays intact
+  # 3. every remaining non-empty line without "<" is a bare opening tag
+  sed -E \
+    -e 's/^(\/.*$)/<\1>/g' \
+    -e 's/^([^<>]*[^<>[:space:]])[[:space:]]*=>[[:space:]]*(.*)$/<\1>\2/' \
+    -e 's/^([^<]+$)/<\1>/g' \
+    -- "$1" > "$2"
+}
+
+# Print the given words in reverse order on a single line.
+# The arguments are joined with spaces and split again at every space, so an
+# argument that contains spaces counts as several words. Each word in the
+# output is followed by one space; no trailing newline is printed.
+reverse() {
+  echo "$@" \
+    | tr ' ' '\n' \
+    | tac \
+    | tr '\n' ' '
+}
+
+# Script -----------------------------------------------------------------------
+
+# Print the line number of the pubDate line that belongs to the item whose
+# guid text equals $1 in the parsed feed file $2. Prints nothing when the
+# guid is not found or its item has no pubDate.
+find_pubdate_line () {
+  awk -v wanted="$1" '
+    # a new item starts: forget everything remembered about the previous one
+    /^item([ \t]|$)/ { inside = 1; date = 0; hit = 0; next }
+    # the item ends: report its pubDate if it contained the wanted guid
+    /^\/item([ \t]|$)/ {
+      if (inside && hit && date) { print date; exit }
+      inside = 0
+      next
+    }
+    inside && /^pubDate/ { date = NR; next }
+    inside && /^guid/ {
+      # compare the text behind the first "=>" literally, so characters
+      # such as "." or "?" in a guid are never treated as a pattern
+      pos = index($0, "=>")
+      if (pos > 0) {
+        text = substr($0, pos + 2)
+        gsub(/^[ \t\r]+|[ \t\r]+$/, "", text)
+        if (text == wanted) hit = 1
+      }
+    }
+  ' "$2"
+}
+
+# Fetch given XML Feed
+if [[ -z "${1:-}" ]]; then
+  printf 'usage: %s <feed-url>\n' "${0##*/}" >&2
+  exit 2
+fi
+
+# the cache directory does not exist on a fresh system
+mkdir -p -- "$cache_dir" || exit 1
+
+# stop on a failed download: the output file may be empty or partial, and
+# processing it would replace the last good snapshot
+if ! wget --quiet "$1" -O "$cache_file"; then
+  printf 'error: could not fetch %s\n' "$1" >&2
+  exit 1
+fi
+
+# parse XML file into one line per tag: "tag" or "tag => text"
+while read_xml; do
+  if [[ $CONTENT =~ [^[:space:]] ]]; then
+    printf '%s => %s\n' "$ENTITY" "$CONTENT"
+  elif [[ -n "$ENTITY" ]]; then
+    # whitespace-only text (indentation between tags) counts as no text;
+    # the empty token in front of the very first "<" is skipped so that the
+    # rewritten feed does not start with a blank line
+    printf '%s\n' "$ENTITY"
+  fi
+done < "$cache_file" > "${cache_file}.new"
+
+# compare old to new file
+if [[ -f "${cache_file}.old" ]]; then
+
+  # GUIDs that appear in the new snapshot but not in the old one,
+  # one per line (a guid tag may carry attributes in front of the "=>")
+  mapfile -t dguids < <(
+    diff -- "${cache_file}.old" "${cache_file}.new" \
+      | sed -n -E \
+        -e 's/[[:space:]]+$//' \
+        -e 's/^> guid[^>]*=>[[:space:]]*//p'
+  )
+
+  # date in the format used by <pubDate>; %M is minutes (%m would be the
+  # month) and LC_ALL=C yields English day and month names
+  new_date=$(LC_ALL=C date '+%a, %d %b %Y %H:%M:%S %z')
+
+  for dguid in "${dguids[@]}"; do
+    [[ -n "$dguid" ]] || continue
+
+    # looked up fresh for every GUID, so a result from a previous GUID can
+    # never be reused by accident
+    date_line=$(find_pubdate_line "$dguid" "${cache_file}.new")
+
+    # replace date in affected line
+    if [[ -n "$date_line" ]]; then
+      sed -i -e "${date_line}s/=>[^\/]*/=> ${new_date}/" "${cache_file}.new"
+    fi
+  done
+
+fi
+
+# write new XML file
+# The feed list added in this commit reads "file:./rewrite/<name>", so the
+# output goes to ./rewrite/. It is written on the first run as well, so the
+# configured file exists right away.
+mkdir -p "./rewrite/"
+write_xml "${cache_file}.new" "./rewrite/${file_name}"
+
+mv -- "${cache_file}.new" "${cache_file}.old"
+
+
+
+
diff --git a/rewrite.txt b/rewrite.txt
@@ -0,0 +1,4 @@
+https://www.jungewelt.de/feeds/kapital_arbeit.xml
+https://www.jungewelt.de/feeds/inland.xml
+https://www.jungewelt.de/feeds/ausland.xml
+