generate.sh (5775B)
1 #!/usr/bin/env bash
2
# Step 1: hand the config file to perlanet, which fetches all feeds and
# populates the markdown --------------------------------------------------------
printf '%s\n' "[1/8] Gather RSS sources..."

# every later step works on ./output/zeitung.md, so the directory has to exist
# (presumably perlanetrc.yaml points perlanet at that file - verify in config)
mkdir -p ./output/
perlanet "./perlanetrc.yaml"
8
# drop every article that is older than yesterday -------------------------------
printf '%s\n' "[2/8] Remove old articles..."

## blank out the START/STOP marker lines that carry today's or yesterday's
## date, so only older articles stay wrapped in markers
## (`date -d` is a GNU date extension)
today="$(date +%Y-%m-%d)"
yesterday="$(date +%Y-%m-%d -d 'yesterday')"
sed -i -E \
  -e "s/^<\!--(START|STOP):${today}T[0-9:]+-->$//g" \
  -e "s/^<\!--(START|STOP):${yesterday}T[0-9:]+-->$//g" \
  ./output/zeitung.md

## anything still enclosed by a START ... STOP marker pair is old: delete it
## (-0777 slurps the whole file so one match can span many lines)
perl -0777 -pi -e 's/<\!--START.*?<\!--STOP.*?-->//smg' ./output/zeitung.md
21
# make sure the file only holds characters the compile step can understand ------
printf '%s\n' "[3/8] Simplify character encoding..."

## Accented vowels, umlauts and sharp s are swapped for bracketed ASCII
## placeholders so they survive the ASCII conversion below; em and en dashes
## are turned into plain hyphens for good.
to_placeholder=(
  -e 's/á/\[`a\]/g'
  -e 's/é/\[`e\]/g'
  -e 's/í/\[`i\]/g'
  -e 's/ó/\[`o\]/g'
  -e 's/ú/\[`u\]/g'
  -e 's/Á/\[`A\]/g'
  -e 's/É/\[`E\]/g'
  -e 's/Í/\[`I\]/g'
  -e 's/Ó/\[`O\]/g'
  -e 's/Ú/\[`U\]/g'
  -e 's/â/\[\^a\]/g'
  -e 's/ê/\[\^e\]/g'
  -e 's/î/\[\^i\]/g'
  -e 's/ô/\[\^o\]/g'
  -e 's/û/\[\^u\]/g'
  -e 's/Â/\[\^A\]/g'
  -e 's/Ê/\[\^E\]/g'
  -e 's/Î/\[\^I\]/g'
  -e 's/Ô/\[\^O\]/g'
  -e 's/Û/\[\^U\]/g'
  -e 's/ß/\[ss\]/g'
  -e 's/ä/\[ae\]/g'
  -e 's/ü/\[ue\]/g'
  -e 's/ö/\[oe\]/g'
  -e 's/Ä/\[Ae\]/g'
  -e 's/Ü/\[Ue\]/g'
  -e 's/Ö/\[Oe\]/g'
  -e 's/\—/\-/g'
  -e 's/\–/\-/g'
)
sed -E "${to_placeholder[@]}" -i ./output/zeitung.md

## convert to ASCII; -c makes iconv discard every character it cannot convert
iconv -t ascii -c ./output/zeitung.md > ./output/tmp.md
mv ./output/tmp.md ./output/zeitung.md

## turn the placeholders back into the characters they stand for
from_placeholder=(
  -e 's/\[`a\]/á/g'
  -e 's/\[`e\]/é/g'
  -e 's/\[`i\]/í/g'
  -e 's/\[`o\]/ó/g'
  -e 's/\[`u\]/ú/g'
  -e 's/\[`A\]/Á/g'
  -e 's/\[`E\]/É/g'
  -e 's/\[`I\]/Í/g'
  -e 's/\[`O\]/Ó/g'
  -e 's/\[`U\]/Ú/g'
  -e 's/\[\^a\]/â/g'
  -e 's/\[\^e\]/ê/g'
  -e 's/\[\^i\]/î/g'
  -e 's/\[\^o\]/ô/g'
  -e 's/\[\^u\]/û/g'
  -e 's/\[\^A\]/Â/g'
  -e 's/\[\^E\]/Ê/g'
  -e 's/\[\^I\]/Î/g'
  -e 's/\[\^O\]/Ô/g'
  -e 's/\[\^U\]/Û/g'
  -e 's/\[ss\]/ß/g'
  -e 's/\[ae\]/ä/g'
  -e 's/\[ue\]/ü/g'
  -e 's/\[oe\]/ö/g'
  -e 's/\[Ae\]/Ä/g'
  -e 's/\[Ue\]/Ü/g'
  -e 's/\[Oe\]/Ö/g'
)
sed -E "${from_placeholder[@]}" -i ./output/zeitung.md
87
# clean up HTML ----------------------------------------------------------------
printf '%s\n' "[4/8] Clean up full text articles..."

## a line that contains an opening headline tag loses its trailing newline,
## which pulls the following line (e.g. the headline text) up onto it
perl -pi -e 's/(<h[1-6][^>]*>)(.*)\n+/$1$2/g' ./output/zeitung.md

## put an empty line behind every closing headline or paragraph tag
perl -pi -e 's/(<\/[hp][1-6]*>)/$1\n\n/g' ./output/zeitung.md

## turn opening headline tags into markdown headings, shifted down two levels:
## h1 -> ###, h2 -> ####, h3 -> #####, h4-h6 -> ######
sed -i -E \
  -e 's/<h[4-6][^>]*?>/\n\n###### /g' \
  -e 's/<h3[^>]*?>/\n\n##### /g' \
  -e 's/<h2[^>]*?>/\n\n#### /g' \
  -e 's/<h1[^>]*?>/\n\n### /g' \
  ./output/zeitung.md
104
105 ## replace strong, bold, italic, emphasis tags with markdown syntax
106 perl -pe 'BEGIN{undef $/;} s/<strong[^>]*?>(.*?)<\/strong>/ __$1__ /smg' \
107 -i ./output/zeitung.md
108 perl -pe 'BEGIN{undef $/;} s/<b[^>]*?>(.*?)<\/b>/ __$1__ /smg' \
109 -i ./output/zeitung.md
110 perl -pe 'BEGIN{undef $/;} s/<i[^>]*?>(.*?)<\/i>/ _$1_ /smg' \
111 -i ./output/zeitung.md
112 perl -pe 'BEGIN{undef $/;} s/<em[^>]*?>(.*?)<\/em>/ _$1_ /smg' \
113 -i ./output/zeitung.md
114
## drop these elements together with everything they enclose
## (whole file is slurped via -0777, so an element may span several lines)
perl -0777 -pi -e '
  s/<time[^>]*?>.*?<\/time>//smg;
  s/<figure[^>]*?>.*?<\/figure>//smg;
  s/<figcaption[^>]*?>.*?<\/figcaption>//smg;
  s/<iframe[^>]*?>.*?<\/iframe>//smg;
' ./output/zeitung.md

## strip every HTML tag that is still left, keeping the text between the tags
perl -0777 -pi -e 's/<[^>]+?>//smg' ./output/zeitung.md
127
## final per-line touch-ups, applied in this order:
##  1. remove whitespace at the start of the line
##  2. re-attach a first capital letter that got separated from its word
##     ("T he" -> "The")
##     NOTE(review): the same rule also glues genuine one-letter words
##     ("A dog" -> "Adog") - confirm this is acceptable
##  3. the text must not contain literal "*" and "+", so backslash-escape them
sed -i -E \
  -e 's/^\s+//g' \
  -e 's/^([A-Z])\s([a-z])/\1\2/g' \
  -e 's/\*/\\*/g' \
  -e 's/\+/\\+/g' \
  ./output/zeitung.md
136
# create the IDs used for internal linking from SHA1 hashes ---------------------
printf '%s\n' "[5/8] Generate article IDs..."

## each HASH:<text>:HASH placeholder becomes the SHA-1 hex digest of <text>
perl -MDigest::SHA=sha1_hex -pi \
  -e 's/HASH:(.+?):HASH/sha1_hex($1)/ge' \
  ./output/zeitung.md
143
144 # generate domains from URLs ---------------------------------------------------
145 echo "[6/8] Generate custom fields..."
146
147 sed -E -e 's/DOMAIN:http(s)*:\/\/([^\/]+).*?:DOMAIN/\2/g' -i ./output/zeitung.md
148
149 # remove double entries --------------------------------------------------------
150 echo "[7/8] Remove double entries..."
151
152 grep "{#" ./output/zeitung.md | \
153 sed -E -e 's/(\-|\;|\\|\/|\ |\!|\"|\#|\$|\&|\(|\)|\||\*|\,|\<|\>|\[|\]|\^|\`|\{|\.)/\\\1/g' | \
154 while read -r line; do
155 perl -pe "BEGIN{undef $/;} s/$line.*?$line/$line/smg" -i ./output/zeitung.md
156 done
157
# render the markdown with pandoc (html, pdf, epub) -----------------------------
printf '%s\n' "[8/8] Generate Output..."

source_md=./output/zeitung.md

printf '%s\n' " -> HTML"
# stylesheet, font-scaling header snippet and a one-level table of contents
html_opts=(
  --css=./templates/style.css
  --include-in-header=./templates/scale_fonts.html
  --self-contained
  --toc
  --toc-depth=1
)
pandoc "$source_md" -t html -f markdown -o ./output/zeitung.html \
  "${html_opts[@]}"

# both PDF variants are built with xelatex and the eisvogel template
pdf_opts=(--pdf-engine=xelatex --template eisvogel)

printf '%s\n' " -> PDF (DIN A4)"
pandoc "$source_md" -t pdf -f markdown -o ./output/zeitung.pdf \
  "${pdf_opts[@]}"

printf '%s\n' " -> PDF (DIN A5)"
pandoc "$source_md" -t pdf -f markdown -o ./output/zeitung_mobile.pdf \
  "${pdf_opts[@]}" -V papersize=a5

printf '%s\n' " -> EPUB"
pandoc "$source_md" -t epub -f markdown -o ./output/zeitung.epub

# done
printf '%s\n' "Done. Have fun reading."