pub / zeitung

News-Aggregator in newspaper style as HTML/PDF/EPUB
git clone src.jayvii.de/pub/zeitung.git
Home | Log | Files | Exports | Refs | README | RSS

generate.sh (5775B)


      1 #!/usr/bin/env bash
      2 
      3 # run perlanet on the config file to fetch all feeds and populate the markdown -
      4 echo "[1/8] Gather RSS sources..."
      5 
      6 mkdir -p ./output/
      7 perlanet "./perlanetrc.yaml"
      8 
      9 # remove all articles before yesterday -----------------------------------------
     10 echo "[2/8] Remove old articles..."
     11 
     12 ## remove start/stop markers from all recent articles
     13 sed -E \
     14   -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d)T[0-9:]+-->$//g" \
     15   -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d -d 'yesterday')T[0-9:]+-->$//g" \
     16   -i ./output/zeitung.md
     17 
     18 ## remove all articles that still have start/stop markers
     19 perl -pe 'BEGIN{undef $/;} s/<\!--START.*?<\!--STOP.*?-->//smg' \
     20   -i ./output/zeitung.md
     21 
     22 # ensure the file only includes characters the compile can understand ----------
     23 echo "[3/8] Simplify character encoding..."
     24 
     25 sed -E \
     26   -e 's/á/\[`a\]/g' \
     27   -e 's/é/\[`e\]/g' \
     28   -e 's/í/\[`i\]/g' \
     29   -e 's/ó/\[`o\]/g' \
     30   -e 's/ú/\[`u\]/g' \
     31   -e 's/Á/\[`A\]/g' \
     32   -e 's/É/\[`E\]/g' \
     33   -e 's/Í/\[`I\]/g' \
     34   -e 's/Ó/\[`O\]/g' \
     35   -e 's/Ú/\[`U\]/g' \
     36   -e 's/â/\[\^a\]/g' \
     37   -e 's/ê/\[\^e\]/g' \
     38   -e 's/î/\[\^i\]/g' \
     39   -e 's/ô/\[\^o\]/g' \
     40   -e 's/û/\[\^u\]/g' \
     41   -e 's/Â/\[\^A\]/g' \
     42   -e 's/Ê/\[\^E\]/g' \
     43   -e 's/Î/\[\^I\]/g' \
     44   -e 's/Ô/\[\^O\]/g' \
     45   -e 's/Û/\[\^U\]/g' \
     46   -e 's/ß/\[ss\]/g' \
     47   -e 's/ä/\[ae\]/g' \
     48   -e 's/ü/\[ue\]/g' \
     49   -e 's/ö/\[oe\]/g' \
     50   -e 's/Ä/\[Ae\]/g' \
     51   -e 's/Ü/\[Ue\]/g' \
     52   -e 's/Ö/\[Oe\]/g' \
     53   -e 's/\—/\-/g' \
     54   -e 's/\–/\-/g' \
     55   -i ./output/zeitung.md
     56 iconv -t ascii -c ./output/zeitung.md | tee ./output/tmp.md > /dev/null
     57 mv ./output/tmp.md ./output/zeitung.md
     58 sed -E \
     59   -e 's/\[`a\]/á/g' \
     60   -e 's/\[`e\]/é/g' \
     61   -e 's/\[`i\]/í/g' \
     62   -e 's/\[`o\]/ó/g' \
     63   -e 's/\[`u\]/ú/g' \
     64   -e 's/\[`A\]/Á/g' \
     65   -e 's/\[`E\]/É/g' \
     66   -e 's/\[`I\]/Í/g' \
     67   -e 's/\[`O\]/Ó/g' \
     68   -e 's/\[`U\]/Ú/g' \
     69   -e 's/\[\^a\]/â/g' \
     70   -e 's/\[\^e\]/ê/g' \
     71   -e 's/\[\^i\]/î/g' \
     72   -e 's/\[\^o\]/ô/g' \
     73   -e 's/\[\^u\]/û/g' \
     74   -e 's/\[\^A\]/Â/g' \
     75   -e 's/\[\^E\]/Ê/g' \
     76   -e 's/\[\^I\]/Î/g' \
     77   -e 's/\[\^O\]/Ô/g' \
     78   -e 's/\[\^U\]/Û/g' \
     79   -e 's/\[ss\]/ß/g' \
     80   -e 's/\[ae\]/ä/g' \
     81   -e 's/\[ue\]/ü/g' \
     82   -e 's/\[oe\]/ö/g' \
     83   -e 's/\[Ae\]/Ä/g' \
     84   -e 's/\[Ue\]/Ü/g' \
     85   -e 's/\[Oe\]/Ö/g' \
     86   -i ./output/zeitung.md
     87 
     88 # clean up HTML ----------------------------------------------------------------
     89 echo "[4/8] Clean up full text articles..."
     90 
     91 ## remove linebreaks at the start of headline tags
     92 perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md
     93 
     94 ## add linebreaks after closing headline or paragraph tags
     95 perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md
     96 
     97 ## replace headline tags with markdown syntax (starting at lvl 3)
     98 sed -E \
     99   -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
    100   -e 's/<h3[^>]*?>/\n\n##### /g' \
    101   -e 's/<h2[^>]*?>/\n\n#### /g' \
    102   -e 's/<h1[^>]*?>/\n\n### /g' \
    103   -i ./output/zeitung.md
    104 
    105 ## replace strong, bold, italic, emphasis tags with markdown syntax
    106 perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>(.*?)<\/strong>/ __$1__ /smg' \
    107   -i ./output/zeitung.md
    108 perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>(.*?)<\/b>/ __$1__ /smg' \
    109   -i ./output/zeitung.md
    110 perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>(.*?)<\/i>/ _$1_ /smg' \
    111   -i ./output/zeitung.md
    112 perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>(.*?)<\/em>/ _$1_ /smg' \
    113   -i ./output/zeitung.md
    114 
    115 ## remove any content that is within specific html tags
    116 perl -pe 'BEGIN{undef $/;} s/<time[^>]*?>.*?<\/time>//smg' \
    117   -i ./output/zeitung.md
    118 perl -pe 'BEGIN{undef $/;} s/<figure[^>]*?>.*?<\/figure>//smg' \
    119   -i ./output/zeitung.md
    120 perl -pe 'BEGIN{undef $/;} s/<figcaption[^>]*?>.*?<\/figcaption>//smg' \
    121   -i ./output/zeitung.md
    122 perl -pe 'BEGIN{undef $/;} s/<iframe[^>]*?>.*?<\/iframe>//smg' \
    123   -i ./output/zeitung.md
    124 
    125 ## remove any remaining HTML tags (but leave their content)
    126 perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md
    127 
    128 ## remove empty space at the start of each line
    129 sed -E -e 's/^\s+//g' -i ./output/zeitung.md
    130 
    131 ## separated first letter at start of article
    132 sed -E -e 's/^([A-Z])\s([a-z])/\1\2/g' -i ./output/zeitung.md
    133 
    134 ## text is not allowed to contain literal "*" & "+", so escape it
    135 sed -E -e 's/\*/\\*/g' -e 's/\+/\\+/g' -i ./output/zeitung.md
    136 
    137 # generate internal linking by creating SHA1 hashes ----------------------------
    138 echo "[5/8] Generate article IDs..."
    139 
    140 perl -MDigest::SHA=sha1_hex \
    141   -pe 's/HASH:(.+?):HASH/sha1_hex$1/ge' \
    142   -i ./output/zeitung.md
    143 
    144 # generate domains from URLs ---------------------------------------------------
    145 echo "[6/8] Generate custom fields..."
    146 
    147 sed -E -e 's/DOMAIN:http(s)*:\/\/([^\/]+).*?:DOMAIN/\2/g' -i ./output/zeitung.md
    148 
    149 # remove double entries --------------------------------------------------------
    150 echo "[7/8] Remove double entries..."
    151 
    152 grep "{#" ./output/zeitung.md | \
    153 sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
    154 while read -r line; do
    155   perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
    156 done
    157 
    158 # generate output (html, pdf, epub) --------------------------------------------
    159 echo "[8/8] Generate Output..."
    160 
    161 echo "      -> HTML"
    162 pandoc ./output/zeitung.md -t html -f markdown -o ./output/zeitung.html \
    163   --css=./templates/style.css --include-in-header=./templates/scale_fonts.html \
    164   --self-contained --toc --toc-depth=1
    165 
    166 echo "      -> PDF (DIN A4)"
    167 pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung.pdf \
    168   --pdf-engine=xelatex --template eisvogel
    169 
    170 echo "      -> PDF (DIN A5)"
    171 pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
    172   --pdf-engine=xelatex --template eisvogel -V papersize=a5
    173 
    174 echo "      -> EPUB"
    175 pandoc ./output/zeitung.md -t epub -f markdown -o ./output/zeitung.epub
    176 
    177 # done
    178 echo "Done. Have fun reading."