generate.sh (6583B)
1 #!/usr/bin/env bash
2
# Step 1: run perlanet with the project configuration so that all feeds are
# fetched and the markdown file is populated.
echo "[1/10] Gather RSS sources..."

# The output directory has to exist before perlanet runs; -p makes this a
# no-op when it is already there.
output_dir=./output/
perlanet_config="./perlanetrc.yaml"

mkdir -p "${output_dir}"
perlanet "${perlanet_config}"
8
# Step 2: drop every article that is older than yesterday ----------------------
echo "[2/10] Remove old articles..."

## Marker lines look like <!--START:<date>T<time>--> / <!--STOP:<date>T<time>-->.
## For the two most recent days the marker lines are blanked, which protects
## those articles from the removal pass below.
today=$(date +%Y-%m-%d)
yesterday=$(date +%Y-%m-%d -d 'yesterday')

for day in "${today}" "${yesterday}"; do
  sed -E -e "s/^<\!--(START|STOP):${day}T[0-9:]+-->$//g" -i ./output/zeitung.md
done

## Whatever is still enclosed in a START ... STOP marker pair is deleted. The
## file is slurped (-0777) so that a single match can span several lines.
perl -0777 -i -pe 's/<\!--START.*?<\!--STOP.*?-->//smg' ./output/zeitung.md
21
# ensure the file only includes characters the compile step can understand -----
echo "[3/10] Simplify character encoding..."

#######################################
# Reduce a text file to ASCII plus a fixed set of accented letters/umlauts.
#
# The kept letters are first swapped for ASCII placeholders such as [ae] or
# [`e], then every remaining non-ASCII character is dropped by iconv, and
# finally the placeholders are turned back into the original letters.
# Em and en dashes are replaced by a plain hyphen.
#
# NOTE(review): text that literally contains a placeholder (e.g. "[ss]") is
# converted as well; this matches the previous behaviour.
#
# Arguments: $1 - path of the file to rewrite in place
# Returns:   0 on success, 1 if the file is missing or no temporary file
#            could be created
#######################################
simplify_encoding() {
  local file=$1
  local ascii_copy

  if [[ ! -f "${file}" ]]; then
    printf 'simplify_encoding: no such file: %s\n' "${file}" >&2
    return 1
  fi

  # protect the characters that must survive the ASCII conversion
  sed -E \
    -e 's/á/\[`a\]/g' \
    -e 's/é/\[`e\]/g' \
    -e 's/í/\[`i\]/g' \
    -e 's/ó/\[`o\]/g' \
    -e 's/ú/\[`u\]/g' \
    -e 's/Á/\[`A\]/g' \
    -e 's/É/\[`E\]/g' \
    -e 's/Í/\[`I\]/g' \
    -e 's/Ó/\[`O\]/g' \
    -e 's/Ú/\[`U\]/g' \
    -e 's/â/\[\^a\]/g' \
    -e 's/ê/\[\^e\]/g' \
    -e 's/î/\[\^i\]/g' \
    -e 's/ô/\[\^o\]/g' \
    -e 's/û/\[\^u\]/g' \
    -e 's/Â/\[\^A\]/g' \
    -e 's/Ê/\[\^E\]/g' \
    -e 's/Î/\[\^I\]/g' \
    -e 's/Ô/\[\^O\]/g' \
    -e 's/Û/\[\^U\]/g' \
    -e 's/ß/\[ss\]/g' \
    -e 's/ä/\[ae\]/g' \
    -e 's/ü/\[ue\]/g' \
    -e 's/ö/\[oe\]/g' \
    -e 's/Ä/\[Ae\]/g' \
    -e 's/Ü/\[Ue\]/g' \
    -e 's/Ö/\[Oe\]/g' \
    -e 's/—/-/g' \
    -e 's/–/-/g' \
    -i "${file}"

  # mktemp without --dry-run creates the file atomically; --dry-run would only
  # print a name and leave a window in which someone else can claim the path.
  ascii_copy=$(mktemp) || return 1

  # -c silently drops everything that has no ASCII representation. iconv may
  # report a non-zero status when it dropped something, which is expected
  # here, so the status is deliberately not checked.
  iconv -t ascii -c "${file}" > "${ascii_copy}"

  # Write back through a redirect (not mv) so the target keeps its own
  # permissions instead of inheriting the 0600 mode of the temp file.
  cat "${ascii_copy}" > "${file}"
  rm -f -- "${ascii_copy}"

  # restore the protected characters
  sed -E \
    -e 's/\[`a\]/á/g' \
    -e 's/\[`e\]/é/g' \
    -e 's/\[`i\]/í/g' \
    -e 's/\[`o\]/ó/g' \
    -e 's/\[`u\]/ú/g' \
    -e 's/\[`A\]/Á/g' \
    -e 's/\[`E\]/É/g' \
    -e 's/\[`I\]/Í/g' \
    -e 's/\[`O\]/Ó/g' \
    -e 's/\[`U\]/Ú/g' \
    -e 's/\[\^a\]/â/g' \
    -e 's/\[\^e\]/ê/g' \
    -e 's/\[\^i\]/î/g' \
    -e 's/\[\^o\]/ô/g' \
    -e 's/\[\^u\]/û/g' \
    -e 's/\[\^A\]/Â/g' \
    -e 's/\[\^E\]/Ê/g' \
    -e 's/\[\^I\]/Î/g' \
    -e 's/\[\^O\]/Ô/g' \
    -e 's/\[\^U\]/Û/g' \
    -e 's/\[ss\]/ß/g' \
    -e 's/\[ae\]/ä/g' \
    -e 's/\[ue\]/ü/g' \
    -e 's/\[oe\]/ö/g' \
    -e 's/\[Ae\]/Ä/g' \
    -e 's/\[Ue\]/Ü/g' \
    -e 's/\[Oe\]/Ö/g' \
    -i "${file}"
}

simplify_encoding ./output/zeitung.md
89
90 # clean up HTML ----------------------------------------------------------------
91 echo "[4/10] Clean up full text articles..."
92
93 ## remove linebreaks at the start of headline tags
94 perl -pe 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' -i ./output/zeitung.md
95
96 ## add linebreaks after closing headline or paragraph tags
97 perl -pe 's/(<\/[hp][1-6]*>)/$1\n\n/g' -i ./output/zeitung.md
98
99 ## replace headline tags with markdown syntax (starting at lvl 3)
100 sed -E \
101 -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
102 -e 's/<h3[^>]*?>/\n\n##### /g' \
103 -e 's/<h2[^>]*?>/\n\n#### /g' \
104 -e 's/<h1[^>]*?>/\n\n### /g' \
105 -i ./output/zeitung.md
106
107 ## replace strong, bold, italic, emphasis tags with markdown syntax
108 perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>\s*(.*?)\s*<\/strong>/ __$1__ /smg' \
109 -i ./output/zeitung.md
110 perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>\s*(.*?)\s*<\/b>/ __$1__ /smg' \
111 -i ./output/zeitung.md
112 perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>\s*(.*?)\s*<\/i>/ _$1_ /smg' \
113 -i ./output/zeitung.md
114 perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>\s*(.*?)\s*<\/em>/ _$1_ /smg' \
115 -i ./output/zeitung.md
116
117 ## remove any content that is within specific html tags
118 perl -pe 'BEGIN{undef $/;} s/<time[^>]*?>.*?<\/time>//smg' \
119 -i ./output/zeitung.md
120 perl -pe 'BEGIN{undef $/;} s/<figure[^>]*?>.*?<\/figure>//smg' \
121 -i ./output/zeitung.md
122 perl -pe 'BEGIN{undef $/;} s/<figcaption[^>]*?>.*?<\/figcaption>//smg' \
123 -i ./output/zeitung.md
124 perl -pe 'BEGIN{undef $/;} s/<iframe[^>]*?>.*?<\/iframe>//smg' \
125 -i ./output/zeitung.md
126
127 ## remove any remaining HTML tags (but leave their content)
128 perl -pe 'BEGIN{undef $/;} s/<[^>]+?>//smg' -i ./output/zeitung.md
129
130 ## remove empty space at the start of each line
131 sed -E -e 's/^\s+//g' -i ./output/zeitung.md
132
133 ## separated first letter at start of article
134 sed -E -e 's/^([A-Z])\s([a-z])/\1\2/g' -i ./output/zeitung.md
135
136 ## text is not allowed to contain literal "*" & "+", so escape it
137 sed -E -e 's/\*/\\*/g' -e 's/\+/\\+/g' -i ./output/zeitung.md
138
# Step 5: build the IDs used for internal links --------------------------------
echo "[5/10] Generate article IDs..."

# Every "HASH:<text>:HASH" span is replaced by the SHA-1 hex digest of <text>.
# The /e flag makes perl evaluate the replacement as code, i.e. it calls
# sha1_hex on the captured text.
perl -MDigest::SHA=sha1_hex -i -pe 's/HASH:(.+?):HASH/sha1_hex$1/ge' ./output/zeitung.md
145
146 # generate domains from URLs ---------------------------------------------------
147 echo "[6/10] Generate custom fields..."
148
149 sed -E -e 's/DOMAIN:http(s)*:\/\/([^\/]+).*?:DOMAIN/\2/g' -i ./output/zeitung.md
150
151 # remove double entries --------------------------------------------------------
152 echo "[7/10] Remove double entries..."
153
154 grep "{#" ./output/zeitung.md | \
155 sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
156 while read -r line; do
157 perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
158 done
159
160 # count entries ----------------------------------------------------------------
161 echo "[8/10] Insert counters..."
162
163 # Insert absolute counts
164 count=$(grep -c "\[NUMBER\]" ./output/zeitung.md)
165 sed -e "s/\[COUNT\]/${count}/g" -i ./output/zeitung.md
166
167 # add running numbers into template
168 for number in $(seq 1 1 $count); do
169 sed -e "0,/\[NUMBER\]/{s//${number}/}" -i ./output/zeitung.md
170 done
171
# Step 9: last tidy-up of the markdown source ----------------------------------
echo "[9/10] Cleaning up source file..."

# cat -s squeezes runs of empty lines into a single one. The result goes to a
# scratch file first because the input cannot be overwritten while it is read.
squeezed=./output/tmp.md
cat -s ./output/zeitung.md > "${squeezed}"
mv "${squeezed}" ./output/zeitung.md
178
# Step 10: render the markdown source with pandoc (html, pdf, epub) ------------
echo "[10/10] Generate Output..."

# Input file and the options shared by both PDF variants.
src=./output/zeitung.md
pdf_opts=(--pdf-engine=xelatex --template eisvogel)

echo " -> HTML"
html_opts=(
  --css=./templates/style.css
  --include-in-header=./templates/scale_fonts.html
  --embed-resources --standalone --toc --toc-depth=1
)
pandoc "${src}" -t html -f markdown -o ./output/zeitung.html "${html_opts[@]}"

echo " -> PDF (DIN A4)"
pandoc "${src}" -t pdf -f markdown -o ./output/zeitung.pdf "${pdf_opts[@]}"

echo " -> PDF (DIN A5)"
# same as above, only the paper size differs
pandoc "${src}" -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
  "${pdf_opts[@]}" -V papersize=a5

echo " -> EPUB"
epub_opts=(--css=./templates/style_epub.css --embed-resources --standalone)
pandoc "${src}" -t epub -f markdown -o ./output/zeitung.epub "${epub_opts[@]}"

# done
echo "Done. Have fun reading."