generate.sh (5391B)
1 #!/usr/bin/env bash
2
# fetch all feeds listed in the perlanet config and populate the markdown ------
# the output directory is created up front (-p: no error if it already exists)
feed_config="./perlanetrc.yaml"
mkdir -p ./output/
perlanet "$feed_config"
6
7 # remove all articles before yesterday -----------------------------------------
8
9 ## remove start/stop markers from all recent articles
10 sed -E \
11 -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d)T[0-9:]+-->$//g" \
12 -e "s/^<\!--(START|STOP):$(date +%Y-%m-%d -d 'yesterday')T[0-9:]+-->$//g" \
13 -i ./output/zeitung.md
14
15 ## remove all articles that still have start/stop markers
16 perl -pe 'BEGIN{undef $/;} s/<\!--START.*?<\!--STOP.*?-->//smg' \
17 -i ./output/zeitung.md
18
# ensure the file only includes characters the compile can understand ----------
#
# iconv (step 2) discards every non-ASCII character. The accented vowels, the
# umlauts and the sharp s are meant to survive, so they are swapped for ASCII
# placeholders such as "[ue]" or "[^a]" first and swapped back afterwards.
# Caveat: text that already contains such a placeholder literally (e.g.
# "[ae]") comes back as the accented letter.

## step 1: protect the supported characters with ASCII placeholders
## (placeholders are written without backslashes: in a GNU sed replacement
## "\u" upper-cases the next character, which turned "ü" into "[E]")
sed -E -i \
  -e 's/á/[`a]/g' \
  -e 's/é/[`e]/g' \
  -e 's/í/[`i]/g' \
  -e 's/ó/[`o]/g' \
  -e 's/ú/[`u]/g' \
  -e 's/Á/[`A]/g' \
  -e 's/É/[`E]/g' \
  -e 's/Í/[`I]/g' \
  -e 's/Ó/[`O]/g' \
  -e 's/Ú/[`U]/g' \
  -e 's/â/[^a]/g' \
  -e 's/ê/[^e]/g' \
  -e 's/î/[^i]/g' \
  -e 's/ô/[^o]/g' \
  -e 's/û/[^u]/g' \
  -e 's/Â/[^A]/g' \
  -e 's/Ê/[^E]/g' \
  -e 's/Î/[^I]/g' \
  -e 's/Ô/[^O]/g' \
  -e 's/Û/[^U]/g' \
  -e 's/ß/[ss]/g' \
  -e 's/ä/[ae]/g' \
  -e 's/ü/[ue]/g' \
  -e 's/ö/[oe]/g' \
  -e 's/Ä/[Ae]/g' \
  -e 's/Ü/[Ue]/g' \
  -e 's/Ö/[Oe]/g' \
  ./output/zeitung.md

## step 2: drop everything that is still not ASCII
## NOTE(review): the mv is deliberately not chained with "&&" - iconv -c may
## report a non-zero status when it discarded characters; confirm before
## tightening this.
iconv -t ascii -c ./output/zeitung.md > ./output/tmp.md
mv ./output/tmp.md ./output/zeitung.md

## step 3: turn the placeholders back into the original characters
## (the backtick stays unescaped: GNU sed reads "\`" inside a pattern as
## "start of pattern space", so an escaped backtick can never match here;
## "^" and the brackets do need their backslash in an extended regex)
sed -E -i \
  -e 's/\[`a\]/á/g' \
  -e 's/\[`e\]/é/g' \
  -e 's/\[`i\]/í/g' \
  -e 's/\[`o\]/ó/g' \
  -e 's/\[`u\]/ú/g' \
  -e 's/\[`A\]/Á/g' \
  -e 's/\[`E\]/É/g' \
  -e 's/\[`I\]/Í/g' \
  -e 's/\[`O\]/Ó/g' \
  -e 's/\[`U\]/Ú/g' \
  -e 's/\[\^a\]/â/g' \
  -e 's/\[\^e\]/ê/g' \
  -e 's/\[\^i\]/î/g' \
  -e 's/\[\^o\]/ô/g' \
  -e 's/\[\^u\]/û/g' \
  -e 's/\[\^A\]/Â/g' \
  -e 's/\[\^E\]/Ê/g' \
  -e 's/\[\^I\]/Î/g' \
  -e 's/\[\^O\]/Ô/g' \
  -e 's/\[\^U\]/Û/g' \
  -e 's/\[ss\]/ß/g' \
  -e 's/\[ae\]/ä/g' \
  -e 's/\[ue\]/ü/g' \
  -e 's/\[oe\]/ö/g' \
  -e 's/\[Ae\]/Ä/g' \
  -e 's/\[Ue\]/Ü/g' \
  -e 's/\[Oe\]/Ö/g' \
  ./output/zeitung.md
81
82 # clean up HTML ----------------------------------------------------------------
83
84 ## remove linebreaks at the start of headline tags
85 perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md
86
87 ## add linebreaks after closing headline or paragraph tags
88 perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md
89
90 ## replace headline tags with markdown syntax (starting at lvl 3)
91 sed -E \
92 -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
93 -e 's/<h3[^>]*?>/\n\n##### /g' \
94 -e 's/<h2[^>]*?>/\n\n#### /g' \
95 -e 's/<h1[^>]*?>/\n\n### /g' \
96 -i ./output/zeitung.md
97
98 ## replace strong, bold, italic, emphasis tags with markdown syntax
99 perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>(.*?)<\/strong>/__$1__/smg' \
100 -i ./output/zeitung.md
101 perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>(.*?)<\/b>/__$1__/smg' \
102 -i ./output/zeitung.md
103 perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>(.*?)<\/i>/_$1_/smg' \
104 -i ./output/zeitung.md
105 perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>(.*?)<\/em>/_$1_/smg' \
106 -i ./output/zeitung.md
107
108 ## remove any content that is within specific html tags
109 perl -pe 'BEGIN{undef $/;} s/<time[^>]*?>.*?<\/time>//smg' \
110 -i ./output/zeitung.md
111 perl -pe 'BEGIN{undef $/;} s/<figure[^>]*?>.*?<\/figure>//smg' \
112 -i ./output/zeitung.md
113 perl -pe 'BEGIN{undef $/;} s/<figcaption[^>]*?>.*?<\/figcaption>//smg' \
114 -i ./output/zeitung.md
115 perl -pe 'BEGIN{undef $/;} s/<iframe[^>]*?>.*?<\/iframe>//smg' \
116 -i ./output/zeitung.md
117
118 ## remove any remaining HTML tags (but leave their content)
119 perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md
120
121 ## remove lines starting with empty space
122 sed -E -e 's/^\s+//g' -i ./output/zeitung.md
123
# clean up quirks --------------------------------------------------------------

## both fixes run line by line in a single sed call:
##   1. an upper-case letter at the start of a line that is separated from the
##      following lower-case letter by one whitespace character is re-attached
##   2. text is not allowed to contain a literal "*", so each one is escaped
sed -E -i \
  -e 's/^([A-Z])\s([a-z])/\1\2/g' \
  -e 's/\*/\\*/g' \
  ./output/zeitung.md
131
# generate internal linking by creating SHA1 hashes ----------------------------
# every "HASH:<url>" token (url = "http" up to the next "(", ")", "|" or "}")
# is replaced by the hex SHA1 digest of that url
perl -MDigest::SHA=sha1_hex -i \
  -pe 's/HASH:(http[^(\)|\})]+)/sha1_hex($1)/ge' \
  ./output/zeitung.md
136
137 # remove double entries --------------------------------------------------------
138 grep "{#" ./output/zeitung.md | \
139 sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
140 while read -r line; do
141 perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
142 done
143
# determine system language ----------------------------------------------------
# $LANG without everything from the first "." on, "_" turned into "-"
# (e.g. "de_DE.UTF-8" becomes "de-DE")
lang=${LANG%%.*}
lang=${lang//_/-}
146
# generate output (html, pdf, epub) --------------------------------------------
# All four documents are rendered from the same markdown file. "$lang" is
# quoted so that an unusual $LANG value can never be word-split or
# glob-expanded into additional pandoc arguments.

## html: table of contents (one level), with the stylesheet and the
## font-scaling header from ./templates
pandoc ./output/zeitung.md -t html -f markdown -o ./output/zeitung.html \
  --css=./templates/style.css \
  --include-in-header=./templates/scale_fonts.html \
  --self-contained --toc --toc-depth=1 -V "lang=${lang}"

## pdf via xelatex and the eisvogel template (default paper size)
pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung.pdf \
  --pdf-engine=xelatex --template eisvogel -V "lang=${lang}"

## pdf for mobile devices: same as above, but on a5 paper
pandoc ./output/zeitung.md -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
  --pdf-engine=xelatex --template eisvogel -V papersize=a5 -V "lang=${lang}"

## epub
pandoc ./output/zeitung.md -t epub -f markdown -o ./output/zeitung.epub \
  -V "lang=${lang}"