pub / zeitung

News-Aggregator in newspaper style as HTML/PDF/EPUB
git clone src.jayvii.de/pub/zeitung.git
Home | Log | Files | Exports | Refs | README | RSS

generate.sh (5391B)


      1 #!/usr/bin/env bash
      2 
      3 # run perlanet on the config file to fetch all feeds and populate the markdown -
      4 mkdir -p ./output/
      5 perlanet "./perlanetrc.yaml"
      6 
      7 # remove all articles before yesterday -----------------------------------------
      8 
      9 ## remove start/stop markers from all recent articles
     10 sed -E \
     11   -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d)T[0-9:]+-->$//g" \
     12   -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d -d 'yesterday')T[0-9:]+-->$//g" \
     13   -i ./output/zeitung.md
     14 
     15 ## remove all articles that still have start/stop markers
     16 perl -pe 'BEGIN{undef $/;} s/<\!--START.*?<\!--STOP.*?-->//smg' \
     17   -i ./output/zeitung.md
     18 
     19 # ensure the file only includes characters the compile can understand ----------
     20 
     21 sed -E \
     22   -e 's/á/\[\`a\]/g' \
     23   -e 's/é/\[\`e\]/g' \
     24   -e 's/í/\[\`i\]/g' \
     25   -e 's/ó/\[\`o\]/g' \
     26   -e 's/ú/\[\`u\]/g' \
     27   -e 's/Á/\[\`A\]/g' \
     28   -e 's/É/\[\`E\]/g' \
     29   -e 's/Í/\[\`I\]/g' \
     30   -e 's/Ó/\[\`O\]/g' \
     31   -e 's/Ú/\[\`U\]/g' \
     32   -e 's/â/\[\^a\]/g' \
     33   -e 's/ê/\[\^e\]/g' \
     34   -e 's/î/\[\^i\]/g' \
     35   -e 's/ô/\[\^o\]/g' \
     36   -e 's/û/\[\^u\]/g' \
     37   -e 's/Â/\[\^A\]/g' \
     38   -e 's/Ê/\[\^E\]/g' \
     39   -e 's/Î/\[\^I\]/g' \
     40   -e 's/Ô/\[\^O\]/g' \
     41   -e 's/Û/\[\^U\]/g' \
     42   -e 's/ß/\[ss\]/g' \
     43   -e 's/ä/\[ae\]/g' \
     44   -e 's/ü/\[\ue\]/g' \
     45   -e 's/ö/\[oe\]/g' \
     46   -e 's/Ä/\[Ae\]/g' \
     47   -e 's/Ü/\[Ue\]/g' \
     48   -e 's/Ö/\[Oe\]/g' \
     49   -i ./output/zeitung.md
     50 iconv -t ascii -c ./output/zeitung.md | tee ./output/tmp.md > /dev/null
     51 mv ./output/tmp.md ./output/zeitung.md
     52 sed -E \
     53   -e 's/\[\`a\]/á/g' \
     54   -e 's/\[\`e\]/é/g' \
     55   -e 's/\[\`i\]/í/g' \
     56   -e 's/\[\`o\]/ó/g' \
     57   -e 's/\[\`u\]/ú/g' \
     58   -e 's/\[\`A\]/Á/g' \
     59   -e 's/\[\`E\]/É/g' \
     60   -e 's/\[\`I\]/Í/g' \
     61   -e 's/\[\`O\]/Ó/g' \
     62   -e 's/\[\`U\]/Ú/g' \
     63   -e 's/\[\^a\]/â/g' \
     64   -e 's/\[\^e\]/ê/g' \
     65   -e 's/\[\^i\]/î/g' \
     66   -e 's/\[\^o\]/ô/g' \
     67   -e 's/\[\^u\]/û/g' \
     68   -e 's/\[\^A\]/Â/g' \
     69   -e 's/\[\^E\]/Ê/g' \
     70   -e 's/\[\^I\]/Î/g' \
     71   -e 's/\[\^O\]/Ô/g' \
     72   -e 's/\[\^U\]/Û/g' \
     73   -e 's/\[ss\]/ß/g' \
     74   -e 's/\[ae\]/ä/g' \
     75   -e 's/\[\ue\]/ü/g' \
     76   -e 's/\[oe\]/ö/g' \
     77   -e 's/\[Ae\]/Ä/g' \
     78   -e 's/\[Ue\]/Ü/g' \
     79   -e 's/\[Oe\]/Ö/g' \
     80   -i ./output/zeitung.md
     81 
     82 # clean up HTML ----------------------------------------------------------------
     83 
     84 ## remove linebreaks at the start of headline tags
     85 perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md
     86 
     87 ## add linebreaks after closing headline or paragraph tags
     88 perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md
     89 
     90 ## replace headline tags with markdown syntax (starting at lvl 3)
     91 sed -E \
     92   -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
     93   -e 's/<h3[^>]*?>/\n\n##### /g' \
     94   -e 's/<h2[^>]*?>/\n\n#### /g' \
     95   -e 's/<h1[^>]*?>/\n\n### /g' \
     96   -i ./output/zeitung.md
     97 
     98 ## replace strong, bold, italic, emphasis tags with markdown syntax
     99 perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>(.*?)<\/strong>/__$1__/smg' \
    100   -i ./output/zeitung.md
    101 perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>(.*?)<\/b>/__$1__/smg' \
    102   -i ./output/zeitung.md
    103 perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>(.*?)<\/i>/_$1_/smg' \
    104   -i ./output/zeitung.md
    105 perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>(.*?)<\/em>/_$1_/smg' \
    106   -i ./output/zeitung.md
    107 
    108 ## remove any content that is within specific html tags
    109 perl -pe 'BEGIN{undef $/;} s/<time[^>]*?>.*?<\/time>//smg' \
    110   -i ./output/zeitung.md
    111 perl -pe 'BEGIN{undef $/;} s/<figure[^>]*?>.*?<\/figure>//smg' \
    112   -i ./output/zeitung.md
    113 perl -pe 'BEGIN{undef $/;} s/<figcaption[^>]*?>.*?<\/figcaption>//smg' \
    114   -i ./output/zeitung.md
    115 perl -pe 'BEGIN{undef $/;} s/<iframe[^>]*?>.*?<\/iframe>//smg' \
    116   -i ./output/zeitung.md
    117 
    118 ## remove any remaining HTML tags (but leave their content)
    119 perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md
    120 
    121 ## remove lines starting with empty space
    122 sed -E -e 's/^\s+//g' -i ./output/zeitung.md
    123 
    124 # clean up quirks --------------------------------------------------------------
    125 
    126 ## separated first letter at start of article
    127 sed -E -e 's/^([A-Z])\s([a-z])/\1\2/g' -i ./output/zeitung.md
    128 
    129 ## text is not allowed to contain literal "*", so escape it
    130 sed -E -e 's/\*/\\*/g' -i ./output/zeitung.md
    131 
    132 # generate internal linking by creating SHA1 hashes ----------------------------
    133 perl -MDigest::SHA=sha1_hex \
    134   -pe 's/HASH:(http[^(\)|\})]+)/sha1_hex$1/ge' \
    135   -i ./output/zeitung.md
    136 
    137 # remove double entries --------------------------------------------------------
    138 grep "{#" ./output/zeitung.md | \
    139 sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
    140 while read -r line; do
    141   perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
    142 done
    143 
    144 # determine system language ----------------------------------------------------
    145 lang=$(echo "$LANG" | sed -e 's/\..*$//g' -e 's/_/-/g')
    146 
    147 # generate output (html, pdf, epub) --------------------------------------------
    148 pandoc ./output/zeitung.md -t html -f markdown -o ./output/zeitung.html \
    149   --css=./templates/style.css --include-in-header=./templates/scale_fonts.html \
    150   --self-contained --toc --toc-depth=1 -V lang=$lang
    151 pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung.pdf \
    152   --pdf-engine=xelatex --template eisvogel -V lang=$lang
    153 pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
    154   --pdf-engine=xelatex --template eisvogel -V papersize=a5 -V lang=$lang
    155 pandoc ./output/zeitung.md -t epub -f markdown -o ./output/zeitung.epub \
    156   -V lang=$lang