1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | use URI::Escape; |
---|
4 | use HTML::Entities; |
---|
5 | use Encode; |
---|
6 | require "../common/common.pm"; |
---|
7 | use utf8; |
---|
8 | |
---|
# Flush STDOUT after every write so that each JSON record leaves the process
# as soon as it is printed.
$| = 1;

# First command-line argument: path of the HTML transcript to parse.
$file = shift;
defined $file or die "Usage: $0 <html_file>\n";

# Slurp the transcript. The lines are joined with a single space, which is
# what interpolating the line array into a string produced with the default
# list separator. Three-argument open on a lexical handle: the file name is
# never interpreted as a mode, and an unreadable file is reported instead of
# silently yielding an empty document.
open(my $fh, '<', $file) or die "Cannot open '$file': $!\n";
$doc = join ' ', <$fh>;
close $fh;

# The source URL is recovered from the URI-escaped file name: everything up
# to the last "http" is dropped.
$url_source = uri_unescape($file);
$url_source =~ s/.*http/http/;

$session = '';
# Used by print_inter() as a store of law numbers keyed by $bigcontext.
%num_lois = ();

# Sitting date: taken from the first "...ance du <day> <month> <year>" found in
# the raw document. datize() and sessionize() are not defined in this file
# (presumably provided by common.pm).
if ($doc =~ /ance du (\d+e?r? \S+ \d+)/i) {
    @date = datize($1);
    $date = join '-', @date;
    $session = sessionize(@date);
}

# --- HTML normalisation -----------------------------------------------------
# Put the whole document on one line.
$doc =~ s/\n/ /g;
# NOTE(review): as written this replaces a space with a space (no-op). The /i
# flag suggests the pattern originally targeted something else (an entity or
# a non-breaking space?) - confirm against the upstream script.
$doc =~ s/ / /gi;
# Drop everything before the first numbered paragraph ...
$doc =~ s/.* id="par_1"/<p id="par_1"/;
# ... and everything from the signature paragraph onwards.
$doc =~ s/<p class="l1_signature" .*//;
# Remove HTML comments and blanks between adjacent tags.
$doc =~ s/<!--[^>]*-->//g;
$doc =~ s/> +</></g;
# NOTE(review): same remark as above, this substitution is a no-op as written.
$doc =~ s/ / /g;
# Merge consecutive italic runs.
$doc =~ s/<\/i> *<i>/ /gi;
# Pull an italic run that directly follows an "info_entre_parentheses" span
# back inside that span.
$doc =~ s/(info_entre_parentheses">[^<]*)<\/span> *<i>([^<]*)<\/i>/$1 $2<\/span>/gi;

# Disabled clean-ups, kept for reference:
#$doc =~ s/<\/?(i|cri|div)[^>]*>//ig;
#$doc =~ s/<p[^>]+class="[^"]+_article"[^>]*>[^<]+<\/p>//g;

# Collapse runs of spaces, then put one paragraph per line: the main loop
# splits the document on newlines.
$doc =~ s/ +/ /g;
$doc =~ s/<\/p>/<\/p>\n/g;

# State of the speech being accumulated.
$intervention = '';
$timestamp = 0;
# print_inter()
#
# Emits the speech accumulated in the global $intervention as one JSON line on
# STDOUT (two lines when the speaker name has the form "X et Y"), then clears
# the per-speech globals ($intervention, $inter, $fonction, $url_inter,
# $amendements).
#
# Nothing is printed unless $heure is set and $intervention is neither empty
# nor "<p></p>".
#
# Reads the globals $bigcontext, $subcontext, $resetcontexte, $inter,
# $fonction, $url_inter, $date, $session, $url_source, $source and $heure;
# updates $timestamp, $context, $numeros_loi, $amendements and %num_lois, and
# may clear $bigcontext/$subcontext when a context reset is pending.
#
# quotize(), name_lowerize() and law_numberize() are not defined in this file
# (presumably provided by common.pm).
sub print_inter {
    if ($heure && $intervention && $intervention ne "<p></p>") {
        # Normalise spacing around commas and advance the pseudo-timestamp
        # that orders the records.
        $intervention =~ s/\s*,\s*/, /g;
        $timestamp += 20;

        # Context string: "<big context> > <sub context>".
        $context = $bigcontext;
        $context =~ s/ suite$//;
        $context .= ' > '.$subcontext if ($subcontext);

        # A context reset is pending (set by the main loop when the sitting
        # resumes). Records without a speaker get an empty context; the first
        # record with a speaker clears the contexts, unless its text shows
        # that the previous discussion simply continues.
        if ($resetcontexte) {
            if (!$inter) {
                $context = "";
            } else {
                $resetcontexte = 0;
                if ($intervention !~ /Nous poursuivons / && $intervention !~ /séance.*reprise.*(amendement.*présenté|(poursuiv|continu|repren)ons.*(discussion|examen|débat)|dans.*discussion.*sommes.*(arrivés|parvenus)|parole.*répondre.*orateurs|je.*mets.*aux.*voix|nous.*allons.*procéder.*(scrutin|délibération))/) {
                    $oldbigcontext = $bigcontext;
                    $bigcontext = "";
                    $subcontext = "";
                    $context = "";
                    $numeros_loi = '';
                }
            }
        }

        # Mention of a bill/proposal/motion/letter followed by its number(s).
        my $law_mention = qr/((projet|proposition|motion|lettre)\s[^<]*(n°|n<sup>os?<\/sup>|nos?|n&[^;]+;&[^;]+;)[^<\.]{1,5}\d[^<\.]+)/i;

        # $cpt counts the law numbers found in this speech. It was previously
        # never incremented, so the trailing comma was never removed and
        # %num_lois was never filled.
        $cpt = 0;
        if ($context =~ /procès verbal|ordre du jour|Conf[&#\d;é]+rence des pr[&#\d;é]+sidents|question.*(crible|orale|gouvernement)/i) {
            # No law numbers are attached in these contexts.
            $numeros_loi = '';
        } elsif ($subcontext !~ /article|discussion g/i && $intervention =~ /$law_mention/ && $intervention !~ /amendements? n/) {
            while ($intervention =~ /$law_mention/g) {
                $docs = $1;
                $docs =~ s/°//g;
                $docs =~ s/&[^;]*;//g;
                if ($docs =~ /(\d+)([\(\[\, ]+(\d{4}[- ]+\d{4})|)/) {
                    while ($docs =~ /\D(\d{1,3})([\(\[\, ]+(\d{4}[- ]+\d{4})|)/g) {
                        # Copy the captures before calling out: the callee
                        # would otherwise receive aliases of $1/$3.
                        my ($number, $years) = ($1, $3);
                        # Separator between entries only, so the list never
                        # ends with a dangling comma.
                        $numeros_loi .= ',' if (length $numeros_loi);
                        # Explicit "(yyyy-yyyy)" years win over the sitting's
                        # own session.
                        $numeros_loi .= law_numberize($number, $years ? $years : $session);
                        $cpt++;
                    }
                }
            }
            # Remember the list for this big context so that it can be
            # restored below once $numeros_loi has been cleared.
            $num_lois{$bigcontext} = $numeros_loi if ($cpt);
        }
        $numeros_loi = $num_lois{$bigcontext} if (!$numeros_loi);

        # Amendment numbers quoted in the speech. A number may also be the
        # last thing in the captured text, hence the "|$" alternative.
        if ($intervention =~ /amendements? n([^<]+)/) {
            $amdt = $1;
            $amdt =~ s/&[^;]*;//g;
            $amdt =~ s/°//g;
            my @amdt_numbers;
            while ($amdt =~ /(\d+)( ?rect|)(?:\D|$)/g) {
                push @amdt_numbers, $1;
            }
            $amendements = join ',', @amdt_numbers;
        }

        $intervention =~ s/<p> +/<p>/g;

        # "X et Y": X keeps the record, Y gets a copy of it.
        $secondinter = '';
        $secondinter = $1 if ($inter =~ s/ et (.*)//);
        $secondinter =~ s/^\s*M[mles]{0,3}[\.\s]+//;

        $json = _inter_json($inter, $fonction);
        print $json;
        if ($secondinter) {
            # The second speaker is emitted without a function.
            $json = _inter_json($secondinter, "");
            print $json;
        }
    }
    $intervention = '';
    $inter = '';
    $fonction = '';
    $url_inter = '';
    $amendements = '';
}

# _inter_json($speaker, $speaker_function)
#
# Private helper of print_inter(): builds the JSON line (UTF-8 encoded,
# newline terminated) for the current speech as said by $speaker. Every other
# field comes from the globals prepared by print_inter(). "numeros_loi" and
# "amendements" are only added when they are non-empty and the record has a
# context.
# NOTE(review): only "contexte" and "intervention" go through quotize(); the
# other fields are inserted as-is, as in the original code.
sub _inter_json {
    my ($speaker, $speaker_function) = @_;
    my $line = '{"contexte": "'.quotize($context).'", "intervention": "'.quotize($intervention).'", "timestamp": "'.$timestamp.'", "date": "'.$date.'", "source": "'.$url_source.$source.'", "heure":"'.$heure.'", "intervenant": "'.name_lowerize($speaker,1).'", "fonction": "'.$speaker_function.'", "intervenant_url": "'.$url_inter.'", "session":"'.$session.'"';
    $line .= ', "numeros_loi":"'.$numeros_loi.'"' if ($numeros_loi && $context);
    $line .= ', "amendements":"'.$amendements.'"' if ($amendements && $context);
    $line .= "}\n";
    utf8::encode($line);
    return $line;
}
# Merge a "titre_S1" title that is split over two consecutive paragraphs into
# a single one.
$doc =~ s/(class="titre_S1"[^>]*>[^<]*)\s*<[^\n]*\n[^\n]*class="titre_S1"[^>]*>\s*/$1 /g;
$resetcontexte = $oldhtab = 0;

# Main loop: one iteration per paragraph (the document was split so that each
# </p> ends a line). The statement order matters: several steps rewrite $_
# for the ones that follow, and print_inter() flushes the speech accumulated
# so far.
foreach (split /\n/, $doc) {
    # Per-line text normalisation.
    s/&(nbsp|#160);/ /ig;
    utf8::decode($_);
    s/ n<sup>[0os\s]+<\/sup>\s*/ n° /ig;
    $_ = decode_entities($_);
    # Adjacent spans are merged, unless the first such junction opens the
    # speaker's quality.
    if (/<\/span><span([^>]*>)/ && $1 !~ /orateur_qualite/) {
        s/<\/span><span[^>]*>/ /g;
        s/ ' /'/g;
    }
    # Anchor of the paragraph, appended to the source URL of the records.
    if (/ (id|name)="([^"]+)"/) {
        $source = "#$2";
    }

    # Parenthesised stage direction: look for the opening/resumption time of
    # the sitting.
    if (/<(i|span class="info_entre_parentheses")>\s*\((.*)<\/(i|span)>([\.\s\)]*)/) {
        $didasc = $2;
        $didasc =~ s/\)$//;
        $didasc =~ s/<[^>]*>//g;
        $didasc =~ s/vingt et une/vingt-et-une/gi;
        if ($didasc =~ /(ouverte|reprise) (à|à) (midi\s*\S*|\S+ heures\s*\S*)\W/) {
            # heurize() is not defined in this file; its result is split on
            # ':' and the first field is compared numerically below.
            $h = heurize($3);
            ($htab) = split /:/, $h;
            # A new time block starts at the first time seen, and when the
            # hour moves past 13 or past 20 relative to the previous block.
            if (!$heure || ($htab > 13 && $oldhtab < 14) || ($htab > 20 && $oldhtab < 21)) {
                print_inter();
                $intervention = "<p>$didasc</p>";
                $oldhtab = $htab;
                if (!$heure) {
                    # Very first time: emit the stage direction under it.
                    $heure = $h;
                    print_inter();
                    next;
                }
                # Otherwise the stage direction is emitted under the previous
                # time, then the clock and the context are reset.
                print_inter();
                $resetcontexte = 1 if ($heure);
                $heure = $h;
                $timestamp = 0;
            }
        }
    }

    # Name of the chair, substituted later for "le président"/"la présidente".
    if (/>[^a-z]*Pr(é|É)sidence de (M[^<]*)/i) {
        $president = $2;
        $president =~ s/^\s*M[mles]{0,3}[\.\s]+//;
        $president =~ s/\s([a-z])(\w+)$/ \U$1$2/;
    }

    # Nothing below applies before the sitting has been opened.
    next if (!$heure);

    # Paragraph introducing a speaker.
    if (/class="intervenant/) {
        if (/class="orateur_nom"[^>]*>([^<]+)</) {
            $tmpinter = $1;
        } elsif (/<a [^>]*>(.+)<\/a>/) {
            $tmpinter = $1;
        }
        $tmpinter =~ s/^\s*M[mles]{0,3}[\.\s]+//;
        $tmpfonction = '';
        $tmpurl_inter = '';
        if (/class="orateur_qualite"[^>]*>([^>]*)</) {
            $tmpfonction = $1;
        }
        if (/href="(\/sen[^"]+)"/i) {
            $tmpurl_inter = "http://www.senat.fr$1";
        }
        $tmpinter =~ s/<[^>]*>//g;
        $tmpinter =~ s/[\.,]\s*$//;
        $tmpfonction =~ s/[\.,]\s*$//;
        # Badly formatted speeches: the name is cut at the period and a
        # closing </span> is inserted after it in the line. The name is
        # quoted with \Q...\E so that it is matched literally and a regex
        # metacharacter in it cannot break the pattern.
        if ($tmpinter =~ /^(.{4}[^\(]*[^M])\./) {
            $tmpinter = $1;
            s/\Q$tmpinter\E/$tmpinter<\/span>/;
        }

        # "le président" / "la présidente" (but not of a mission) stands for
        # the chair recorded above. $1 still holds the title captured by the
        # first match, because the second pattern did not match.
        if (($tmpinter =~ /l[ae][ &\#\;0-9]+(pr(\&\#233\;|é|É)sidente?)/i) && $tmpinter !~ /mission/i && $president) {
            $tmpinter = $president;
            $tmpfonction = $1;
        }
        # Without an explicit quality, whatever follows the first comma of
        # the name is taken as the function.
        if (!$tmpfonction && $tmpinter =~ s/,(.*)//) {
            $tmpfonction = $1;
        }
        $tmpfonction =~ s/^[,\s]+//;
        # Speaker change: flush the previous speech and switch speaker.
        if ($tmpinter ne $inter) {
            print_inter();
            $inter = $tmpinter;
            $url_inter = $tmpurl_inter;
            $fonction = $tmpfonction;
        }
        # "info_entre_parentheses" spans that do not start with a parenthesis
        # are turned into plain italics.
        s/<span class="info_entre_parentheses">([^\(][^<]*)<\/span>/<i>$1<\/i>/g;
    }

    # Outside titles and article mentions, each parenthesised stage direction
    # is cut out of the line: the text before it is appended to the current
    # speech, and the direction itself becomes a record of its own, emitted
    # with the speaker fields cleared by the preceding print_inter().
    if (!(/"titre_S([123][^"]*)"/ || /"mention_(article)"/)) {
        while (s/([^>]*)<(i|span class="info_entre_parentheses")>\(([^\)]*)\)?<\/(i|span)>([\.\s\)]*)//) {
            $i = $1;
            $didasc = $3;
            $i =~ s/<[^>]*>//g;
            $i =~ s/\s+/ /g;
            $i =~ s/\s+$//;
            $intervention .= "<p>".$i."</p>";
            $didasc =~ s/<[^>]*>//gi;
            $didasc =~ s/\)//g;
            if ($didasc && $didasc !~ /^(suite|nouveau)$/i) {
                # print_inter() clears the speaker: save it and restore it
                # once the stage direction has been emitted.
                $predida_inter = $inter;
                $predida_urlinter = $url_inter;
                $predida_fonction = $fonction;
                print_inter();
                $intervention = '<p>'.$didasc.'</p>';
                print_inter();
                $inter = $predida_inter;
                $url_inter = $predida_urlinter;
                $fonction = $predida_fonction;
            }
        }
    }

    # Any title ends the speech of the current speaker.
    if (/class="titre_/) {
        if ($inter) {
            print_inter();
        }
    }

    # Titles of level 1-3 and article mentions define the context;
    # $iscontext keeps what the matching pattern captured.
    $iscontext = '';
    if (/"titre_S([123][^"]*)"/ || /"mention_(article)"/) {
        $iscontext = $1;
        print_inter();
        s/<(i|span class="info_entre_parentheses")>\([^\)]*\)?<\/(i|span)>//;
    }

    # Reduce the line to the content of its numbered paragraph.
    if (s/.*id="(intv_|)par_[^>]*>\s*(.*)\s*<\/p>.*/$2/i) {
        s/(<span.*|)<\/span>\s*//i;
        s/\.\.\.\.+//g;
        s/\s+$//;
        s/<\/sup><i>/<\/sup> <i>/gi;
        if ($_) {
            if ($iscontext) {
                s/<[^>]*>//g;
                if ($iscontext eq '1') {
                    # Level-1 title: new big context, unless it is the
                    # "PRÉSIDENCE DE ..." banner.
                    if (!/^\s*PR(É|É)SIDENCE DE /) {
                        $resetcontexte = 0;
                        $bigcontext = $_;
                        $subcontext = '';
                        $intervention = "<p>$bigcontext</p>";
                        print_inter();
                    }
                } else {
                    # Other titles and article mentions: new sub context,
                    # unless the line only names the (vice-)president.
                    if (!/^\s*(vice-)?pr(é|é)sident/) {
                        $resetcontexte = 0;
                        $bigcontext = $oldbigcontext if (!$bigcontext);
                        $subcontext = $_;
                        $subcontext =~ s/<[^>]+>//g;
                        $subcontext =~ s/\s+/ /g;
                        $intervention = "<p>$subcontext</p>";
                        print_inter();
                    }
                }
            } elsif (/[a-z]/i) {
                # Ordinary paragraph: append it to the current speech.
                s/^\. //;
                $intervention .= "<p>".$_."</p>";
            }
        }
    }

}
# Flush the last pending speech.
print_inter();