#!/usr/bin/env bash

# fetch all feeds --------------------------------------------------------------
# perlanet is driven by ./perlanetrc.yaml; every later step works on
# ./output/zeitung.md (presumably the file that config tells perlanet to
# write -- confirm in perlanetrc.yaml), so the directory must exist first.
feed_config="./perlanetrc.yaml"
mkdir -p ./output/
perlanet "${feed_config}"

# remove all articles before yesterday -----------------------------------------

## remove start/stop markers from all recent articles
sed -E \
  -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d)T[0-9:]+-->$//g" \
  -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d -d 'yesterday')T[0-9:]+-->$//g" \
  -i ./output/zeitung.md

## remove all articles that still have start/stop markers
perl -pe 'BEGIN{undef $/;} s/<\!--START.*?<\!--STOP.*?-->//smg' \
  -i ./output/zeitung.md

# ensure the file only includes characters the compile can understand ----------
#
# iconv (below) throws away every character that has no ASCII representation.
# The accented letters and umlauts in the table must survive, so each one is
# swapped for an ASCII placeholder "[tag]" before the conversion and swapped
# back afterwards.
# NOTE(review): text that already contains a literal placeholder such as "[ae]"
# is turned into the special character as well.

## character/tag pairs; both sed passes are generated from this one table so the
## two directions cannot drift apart
keep_chars=(
  'á' '`a' 'é' '`e' 'í' '`i' 'ó' '`o' 'ú' '`u'
  'Á' '`A' 'É' '`E' 'Í' '`I' 'Ó' '`O' 'Ú' '`U'
  'â' '^a' 'ê' '^e' 'î' '^i' 'ô' '^o' 'û' '^u'
  'Â' '^A' 'Ê' '^E' 'Î' '^I' 'Ô' '^O' 'Û' '^U'
  'ß' 'ss'
  'ä' 'ae' 'ü' 'ue' 'ö' 'oe'
  'Ä' 'Ae' 'Ü' 'Ue' 'Ö' 'Oe'
)

protect_args=()
restore_args=()
for ((i = 0; i < ${#keep_chars[@]}; i += 2)); do
  char=${keep_chars[i]}
  tag=${keep_chars[i + 1]}
  # Only "^" needs escaping on the pattern side (it is an anchor in an ERE).
  # Letters and the backtick are deliberately left unescaped: GNU sed reads
  # "\u" in a replacement as "uppercase the next character" and "\`" in a
  # pattern as "start of buffer", either of which breaks the round trip.
  tag_re=${tag//\^/\\^}
  protect_args+=(-e "s/${char}/[${tag}]/g")
  restore_args+=(-e "s/\\[${tag_re}\\]/${char}/g")
done

## special characters -> placeholders
sed -E "${protect_args[@]}" -i ./output/zeitung.md

## drop everything that is still not ASCII (-c omits unconvertible characters)
iconv -t ascii -c ./output/zeitung.md > ./output/tmp.md
mv ./output/tmp.md ./output/zeitung.md

## placeholders -> special characters
sed -E "${restore_args[@]}" -i ./output/zeitung.md

# clean up HTML ----------------------------------------------------------------

## remove linebreaks at the start of headline tags
perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md

## add linebreaks after closing headline or paragraph tags
perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md

## replace headline tags with markdown syntax (starting at lvl 3)
sed -E \
  -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
  -e 's/<h3[^>]*?>/\n\n##### /g' \
  -e 's/<h2[^>]*?>/\n\n#### /g' \
  -e 's/<h1[^>]*?>/\n\n### /g' \
  -i ./output/zeitung.md

## replace strong, bold, italic, emphasis tags with markdown syntax
perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>(.*?)<\/strong>/__$1__/smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>(.*?)<\/b>/__$1__/smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>(.*?)<\/i>/_$1_/smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>(.*?)<\/em>/_$1_/smg' \
  -i ./output/zeitung.md

## remove any content that is within specific html tags
perl -pe 'BEGIN{undef $/;} s/<time[^>]*?>.*?<\/time>//smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<figure[^>]*?>.*?<\/figure>//smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<figcaption[^>]*?>.*?<\/figcaption>//smg' \
  -i ./output/zeitung.md
perl -pe 'BEGIN{undef $/;} s/<iframe[^>]*?>.*?<\/iframe>//smg' \
  -i ./output/zeitung.md

## remove any remaining HTML tags (but leave their content)
perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md

## remove lines starting with empty space
sed -E -e 's/^\s+//g' -i ./output/zeitung.md

# clean up quirks --------------------------------------------------------------

## 1. a capital letter at the start of a line that is separated from a
##    following lower-case letter by one whitespace character is joined to it
##    again (e.g. "D ie" -> "Die")
## 2. the text is not allowed to contain a literal "*", so every "*" is
##    escaped as "\*"
sed -E \
  -e 's/^([A-Z])\s([a-z])/\1\2/g' \
  -e 's/\*/\\*/g' \
  -i ./output/zeitung.md

# generate internal linking by creating SHA1 hashes ----------------------------
## every "HASH:http..." token is replaced by the SHA1 hex digest of its url
## part; the url ends in front of the first "(", ")", "|" or "}"
perl -MDigest::SHA=sha1_hex -i \
  -pe 's/HASH:(http[^(\)|\})]+)/sha1_hex($1)/ge' \
  ./output/zeitung.md

# remove double entries --------------------------------------------------------
grep "{#" ./output/zeitung.md | \
sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
while read -r line; do
  perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
done

# determine system language ----------------------------------------------------
## turn the locale in $LANG into a language tag: cut everything from the first
## "." (the encoding) and use "-" instead of "_", e.g. de_DE.UTF-8 -> de-DE
lang=${LANG:-}
lang=${lang%%.*}
lang=${lang//_/-}

# generate output (html, pdf, epub) --------------------------------------------
## every format is rendered from the same markdown file; the language tag is
## passed on as the pandoc variable "lang"
pandoc ./output/zeitung.md -t html -f markdown -o ./output/zeitung.html \
  --css=./templates/style.css --include-in-header=./templates/scale_fonts.html \
  --self-contained --toc --toc-depth=1 -V "lang=${lang}"
pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung.pdf \
  --pdf-engine=xelatex --template eisvogel -V "lang=${lang}"
## same PDF again on A5 paper for small screens
pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
  --pdf-engine=xelatex --template eisvogel -V papersize=a5 -V "lang=${lang}"
pandoc ./output/zeitung.md -t epub -f markdown -o ./output/zeitung.epub \
  -V "lang=${lang}"
