commit eb1fdb06f596676a625fad6549e07bd0557fae70
Author: JayVii <jayvii[AT]posteo[DOT]de>
Date: Mon, 2 Mar 2026 18:34:52 +0100
feat: initial version of Zeitung
Diffstat:
7 files changed, 363 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+output/*
+perlanetrc.yaml
diff --git a/README.md b/README.md
@@ -0,0 +1,57 @@
+# Zeitung
+
+For comments and ideas, please send issues and code-patches to:
+[jayvii+zeitung[AT]posteo[DOT]de](mailto:jayvii+zeitung[AT]posteo[DOT]de).
+
+## About
+
+A thin script around [perlanet](https://github.com/davorg-cpan/perlanet) and
+[morss](https://github.com/pictuga/morss) that gathers news from a pre-defined
+set of sources and formats them for newspaper-style consumption in various
+formats, such as:
+
+- HTML, i.e. for any type of browser either mobile or desktop
+- PDF, once in DIN A4 and once in DIN A5 format with the
+ [Eisvogel](https://github.com/Wandmalfarbe/pandoc-latex-template) template
+- EPUB for ebook readers
+
+Zeitung is intended to run **once a day**, e.g. in the very early morning, so
+you can catch up on the latest news once a day without falling into the habit
+of checking the news constantly. Of course, the script also works if you would
+like to update all document types multiple times a day, if you really wish to.
+
+For example, you could generate the documents automatically every morning at 6AM
+on your server and publish them through an already running webserver, from
+where you can download the appropriate document to your device.
+
+- Exemplary use cases would be to read *Zeitung* in the morning on an E-Book
+ reader (EPUB) or tablet (EPUB or DIN A5 PDF).
+- Or read *Zeitung* while commuting via train on a phone (HTML or DIN A5 PDF)
+- Or maybe, for you the best time for news consumption is during the evening on
+ your computer screen (HTML or DIN A4 PDF)
+
+## Name
+
+*Zeitung* means newspaper in German. Very original, I know.
+
+
+
+## Dependencies
+
+- [Pandoc](https://pandoc.org/)
+- [LaTeX / TeX Live](https://www.tug.org/texlive/)
+- [Perlanet](https://github.com/davorg-cpan/perlanet/)
+
+Optionally, install [morss](https://github.com/pictuga/morss) on your own server
+or use a public service like the upstream [morss.it](https://morss.it).
+
+## How to start
+
+Copy the example perlanet config `perlanetrc.example.yaml` to `perlanetrc.yaml`
+and edit it accordingly. Please refer to the
+[perlanet documentation](https://github.com/davorg-cpan/perlanet) for specifics.
+
+For Zeitung to work properly, each feed needs both an article summary and the
+full content of the article. Most news websites do not publish the full article
+text within their RSS feeds. In these cases, you can use `morss` to fetch
+the full text from the article sites for you (see `perlanetrc.example.yaml`).
diff --git a/generate.sh b/generate.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+
+# let perlanet fetch every configured feed and render ./output/zeitung.md
+perlanet "./perlanetrc.yaml"
+
+# keep only ASCII plus German umlauts/eszett: park them as [xx], strip, restore
+sed -E -i '
+ s/ä/\[ae\]/g
+ s/ö/\[oe\]/g
+ s/ü/\[ue\]/g
+ s/ß/\[ss\]/g
+ s/Ä/\[Ae\]/g
+ s/Ö/\[Oe\]/g
+ s/Ü/\[Ue\]/g' ./output/zeitung.md
+iconv -t ascii -c ./output/zeitung.md > ./output/zeitung2.md
+mv ./output/zeitung2.md ./output/zeitung.md
+sed -E -i '
+ s/\[ae\]/ä/g
+ s/\[oe\]/ö/g
+ s/\[ue\]/ü/g
+ s/\[ss\]/ß/g
+ s/\[Ae\]/Ä/g
+ s/\[Oe\]/Ö/g
+ s/\[Ue\]/Ü/g' ./output/zeitung.md
+iconv -t utf-8 -c ./output/zeitung.md > ./output/zeitung2.md
+mv ./output/zeitung2.md ./output/zeitung.md
+
+# from here on ./output/zeitung.md only holds ASCII and the restored umlauts
+
+# drop every article that was issued before yesterday
+## step 1: strip the START/STOP markers of articles issued today or yesterday
+today=$(date +%Y-%m-%d); yesterday=$(date +%Y-%m-%d -d 'yesterday')
+sed -E -i \
+ -e "s/^<\!--(START|STOP):${today}T[0-9:]+-->$//g" \
+ -e "s/^<\!--(START|STOP):${yesterday}T[0-9:]+-->$//g" ./output/zeitung.md
+## step 2: anything still wrapped in START/STOP markers is older, so delete it
+perl -0777 -i -pe 's/<\!--START.*?<\!--STOP.*?-->//smg' ./output/zeitung.md
+
+# clean up HTML (tag names are anchored with \b so that e.g. <b> never matches <br>)
+
+## join each line that contains an opening headline tag with the line after it
+perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md
+
+## add linebreaks after closing headline or paragraph tags
+perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md
+
+## replace headline tags with markdown syntax (starting at lvl 3)
+sed -E \
+ -e 's/<h[4-6][^>]*>/\n\n###### /g' \
+ -e 's/<h3[^>]*>/\n\n##### /g' \
+ -e 's/<h2[^>]*>/\n\n#### /g' \
+ -e 's/<h1[^>]*>/\n\n### /g' \
+ -i ./output/zeitung.md
+
+## replace strong, bold, italic, emphasis tags with markdown syntax
+perl -pe 'BEGIN{undef $/;} s/<strong\b[^>]*>(.*?)<\/strong>/__$1__/smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<b\b[^>]*>(.*?)<\/b>/__$1__/smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<i\b[^>]*>(.*?)<\/i>/_$1_/smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<em\b[^>]*>(.*?)<\/em>/_$1_/smg' -i ./output/zeitung.md
+
+## remove any content that is within specific html tags
+perl -pe 'BEGIN{undef $/;} s/<time\b[^>]*>.*?<\/time>//smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<figure\b[^>]*>.*?<\/figure>//smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<figcaption\b[^>]*>.*?<\/figcaption>//smg' -i ./output/zeitung.md
+perl -pe 'BEGIN{undef $/;} s/<iframe\b[^>]*>.*?<\/iframe>//smg' -i ./output/zeitung.md
+
+## remove any remaining HTML tags (but leave their content)
+perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md
+
+## strip leading whitespace from every line
+sed -E -e 's/^\s+//g' -i ./output/zeitung.md
+
+# clean up quirks
+## 1. re-attach a separated first letter (note: also joins "A"/"I" + lowercase word)
+sed -E -e 's/^([A-Z])\s([a-z])/\1\2/g' -i ./output/zeitung.md
+## 2. text is not allowed to contain literal "*", so escape it
+sed -E -e 's/\*/\\*/g' -i ./output/zeitung.md
+
+# generate internal linking by creating SHA1 hashes
+perl -MDigest::SHA=sha1_hex \
+ -pe 's/HASH:(http[^(\)|\})]+)/sha1_hex$1/ge' \
+ -i ./output/zeitung.md
+
+# remove double entries
+grep "{#" ./output/zeitung.md | \
+sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
+while read -r line; do
+ perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
+done
+
+# derive a language tag from $LANG: drop encoding/modifier ("de_DE.UTF-8" -> "de-DE")
+lang=${LANG%%[.@]*}; lang=${lang//_/-}
+
+# generate output (html, pdf in A4 and A5, epub); "$lang" is quoted against word splitting
+pandoc ./output/zeitung.md -t html -f markdown -o ./output/zeitung.html \
+ --css=./templates/style.css --self-contained --toc --toc-depth=1 -V lang="$lang"
+pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung.pdf \
+ --pdf-engine=xelatex --template eisvogel -V lang="$lang"
+pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
+ --pdf-engine=xelatex --template eisvogel -V papersize=a5 -V lang="$lang"
+pandoc ./output/zeitung.md -t epub -f markdown -o ./output/zeitung.epub \
+ -V lang="$lang"
diff --git a/perlanetrc.example.yaml b/perlanetrc.example.yaml
@@ -0,0 +1,35 @@
+title: Zeitung
+entries: 500
+entries_per_feed: 10
+entries_sort_order: "issued"
+page:
+ file: output/zeitung.md
+ template: templates/index.tt
+feed:
+ file: /dev/null
+ format: Atom
+feeds:
+ - title: Tagesschau - Inland
+ url: https://morss.it/https://www.tagesschau.de/inland/index~rss2.xml
+ web: https://www.tagesschau.de/inland/
+ - title: Tagesschau - Ausland
+ url: https://morss.it/https://www.tagesschau.de/ausland/index~rss2.xml
+ web: https://www.tagesschau.de/ausland/
+ - title: Tagesschau - Wirtschaft
+ url: https://morss.it/https://www.tagesschau.de/wirtschaft/index~rss2.xml
+ web: https://www.tagesschau.de/wirtschaft/
+ - title: Tagesschau - Wissen
+ url: https://morss.it/https://www.tagesschau.de/wissen/index~rss2.xml
+ web: https://www.tagesschau.de/wissen/
+ - title: Tagesschau - Faktenfinder
+ url: https://morss.it/https://www.tagesschau.de/faktenfinder/index~rss2.xml
+ web: https://www.tagesschau.de/faktenfinder/
+ - title: ZDFheute - Politik
+ url: https://morss.it/https://www.zdfheute.de/rss/zdf/nachrichten/politik
+ web: https://www.zdfheute.de/politik
+ - title: ZDFheute - Wirtschaft
+ url: https://morss.it/https://www.zdfheute.de/rss/zdf/nachrichten/wirtschaft
+ web: https://www.zdfheute.de/wirtschaft
+ - title: ZDFheute - Wissen
+ url: https://morss.it/https://www.zdfheute.de/rss/zdf/nachrichten/wissen
+ web: https://www.zdfheute.de/wissen
diff --git a/templates/background.pdf b/templates/background.pdf
@@ -0,0 +1,70 @@
+%PDF-1.5
+%
+4 0 obj
+<< /Length 5 0 R
+ /Filter /FlateDecode
+>>
+stream
+xe0FJcU$v?QȒ㙁9q0f%=!W9(pWzُbif1R/YgX֗O0cbl]Kfz]
+gd.➾Jws[C
+q0cbHNK2
+endstream
+endobj
+5 0 obj
+ 181
+endobj
+3 0 obj
+<<
+ /ExtGState <<
+ /a0 << /CA 1 /ca 1 >>
+ >>
+>>
+endobj
+2 0 obj
+<< /Type /Page % 1
+ /Parent 1 0 R
+ /MediaBox [ 0 0 595.275574 841.889771 ]
+ /Contents 4 0 R
+ /Group <<
+ /Type /Group
+ /S /Transparency
+ /I true
+ /CS /DeviceRGB
+ >>
+ /Resources 3 0 R
+>>
+endobj
+1 0 obj
+<< /Type /Pages
+ /Kids [ 2 0 R ]
+ /Count 1
+>>
+endobj
+6 0 obj
+<< /Producer (cairo 1.16.0 (https://cairographics.org))
+ /CreationDate (D:20190804184551+02'00)
+>>
+endobj
+7 0 obj
+<< /Type /Catalog
+ /Pages 1 0 R
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000599 00000 n
+0000000367 00000 n
+0000000295 00000 n
+0000000015 00000 n
+0000000273 00000 n
+0000000664 00000 n
+0000000780 00000 n
+trailer
+<< /Size 8
+ /Root 7 0 R
+ /Info 6 0 R
+>>
+startxref
+832
+%%EOF
diff --git a/templates/index.tt b/templates/index.tt
@@ -0,0 +1,68 @@
+---
+title: "Zeitung"
+date: "[% feed.modified %]"
+titlepage: true
+titlepage-text-color: "FFFFFF"
+titlepage-rule-color: "360049"
+titlepage-rule-height: 0
+titlepage-background: "templates/background.pdf"
+toc: true
+toc-depth: 1
+toc-own-page: true
+number-sections: true
+book: true
+papersize: a4
+documentclass: article
+header-includes:
+ - \usepackage{multicol}
+ - \newcommand{\hideFromPandoc}[1]{#1}
+ - \hideFromPandoc{\let\Begin\begin \let\End\end}
+---
+
+# Overview
+
+[% FOREACH entry IN feed.entries %]
+<!--START:[% entry.issued | html %]-->
+## [% entry.title | html %] {#overview-HASH:[% entry.link | url | html %]}
+
+[% entry.issued | html %][% IF entry.author %] by [% entry.author | html %][% END %]
+
+[% IF entry.summary.body %]
+[% entry.summary.body %]
+[% ELSE %]
+[% entry.content.body %]
+[% END %]
+
+_[Read article](#HASH:[% entry.link | url | html %])_
+<!--STOP:[% entry.issued | html %]-->
+[% END %]
+
+---
+
+\newpage
+
+# Articles
+
+\Begin{multicols}{2}
+
+[% FOREACH entry IN feed.entries %]
+<!--START:[% entry.issued | html %]-->
+## [% entry.title | html %] {#HASH:[% entry.link | url | html %]}
+
+- [% entry.issued | html %][% IF entry.author %] by [% entry.author | html %][% END %]
+- _[Read article online]([% entry.link | url %])_
+- _[Back to the overview](#overview-HASH:[% entry.link | url | html %])_
+
+[% IF entry.content.body %]
+[% entry.content.body %]
+[% ELSE %]
+The content of the article could not be found.
+[% END %]
+
+_[Back to the overview](#overview-HASH:[% entry.link | url | html %])_
+
+---
+<!--STOP:[% entry.issued | html %]-->
+[% END %]
+
+\End{multicols}
diff --git a/templates/style.css b/templates/style.css
@@ -0,0 +1,29 @@
+/* light (sepia paper) palette and base typography */
+:root {
+ --bg: #f4ecd8;
+ --fg: #000000;
+ background-color: var(--bg);
+ color: var(--fg);
+ font-family: "Times New Roman", Times, serif;
+}
+/* palette override for readers who prefer a dark colour scheme */
+@media (prefers-color-scheme: dark) {
+ :root {
+ --bg: #1c1b22;
+ --fg: #fbfbfe;
+ }
+}
+body {
+ margin-left: auto;
+ margin-right: auto;
+ max-width: 980px;
+ width: 95%;
+}
+a {
+ color: var(--fg) !important;
+ font-weight: bolder;
+ text-decoration: underline;
+}
+p {
+ text-align: justify;
+}