1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | use URI::Escape; |
---|
4 | require "../common/common.pm"; |
---|
5 | |
---|
# ---------------------------------------------------------------------------
# Input handling and HTML normalisation.
#
# The single command-line argument is the path of the page to parse. Its
# URL-unescaped form is also used to derive the "source" URL printed with
# every intervention.
# ---------------------------------------------------------------------------
$file = shift;
if (!defined $file) {
    print STDERR "ERROR pas de fichier en argument\n";
    exit 1;
}

$url_source = uri_unescape($file);
# First 2xxx-looking number of the unescaped name; handed to datize() later
# on (presumably as the fallback year -- confirm in common.pm).
if ($url_source =~ /(2\d{3})/) {
    $url_year = $1;
}
# Drop everything in front of the last "/http" that follows "html", so that
# only the URL part of the name remains.
$url_source =~ s/.*html.*\/http/http/;

# Three-argument open on a lexical handle: the name comes from the command
# line and must never be interpreted as an open mode or a pipe. A file that
# cannot be read is reported instead of silently producing no output.
my $fh;
if (!open($fh, '<', $file)) {
    print STDERR "ERROR impossible d'ouvrir $file : $!\n";
    exit 1;
}
@lignes = <$fh>;
close $fh;

# Join the lines with the list separator (a space by default) and remove the
# newlines: the whole page becomes a single line.
$content = "@lignes";
$content =~ s/\n//g;
# Inside table cells, drop a/strong/p/em tags that directly follow <td ...>
# and one such tag that directly precedes </td>.
$content =~ s/(<td[^>]*>)(\s*<\/?(a|strong|p|em)[^>]*>)+/$1/gi;
$content =~ s/<\/?(a|strong|p|em)[^>]*>\s*<\/td>/<\/td>/gi;

# Re-insert a newline after each closing p/h1-h4/ul/div so that the main loop
# (which splits on "\n") sees one such element per chunk.
$content =~ s/<\/(p|h[1234]|ul|div)>/<\/$1>\n/gi;

# Remove a <b>...</b> wrapper placed directly inside a heading.
$content =~ s/(<h\d[^>]*>)\s*<b>/$1/gi;
$content =~ s/<\/b>\s*(<\/h\d[^>]*>)/$1/gi;
# Collapse runs of spaces and tabs.
$content =~ s/[ \t]+/ /g;

# speaker name => function, filled by setfonction() and the main loop.
%fonctions = ();

# Running counter put in each output line (print_inter adds 20 per line).
$timestamp = 0;
# 1, 2 or 3; used by the main loop to build the "heure" label.
$nb_seance = 1;
# print_inter()
#
# Flushes the intervention currently accumulated in the package globals as a
# single JSON-like line on STDOUT, then clears the per-speaker state
# ($intervenant, $fonction, $url_intervenant, $intervention, $amendements).
#
# Nothing is printed when $intervention is empty or equal to '<p></p>'; the
# state is cleared in every case.
#
# Side effects when something is printed:
#   - $numeros_loi is recomputed if the text mentions "projet de loi n..." or
#     "texte n...";
#   - $amendements is set if the text mentions "amendement n...";
#   - $timestamp is increased by 20;
#   - the script exits with status 1 if $date is not YYYY-MM-DD shaped or if
#     $commission is empty or contains '/', '<' or '>'.
#
# law_numberize(), quotize() and name_lowerize() are not defined in this file
# (presumably they come from ../common/common.pm).
sub print_inter {
    my $has_content = ($intervention && $intervention ne '<p></p>');

    if ($has_content) {
        # Bill references: every " n <digits> (<digits>-<digits>)" found after
        # the keyword is converted by law_numberize() and comma-joined.
        if ($intervention =~ /(projet de loi|texte)( n[^<]+)/) {
            $doc = $2;
            $doc =~ s/&[^;]+;//g;          # strip HTML entities
            $numeros_loi = '';
            $numeros_loi .= law_numberize($1,$2)."," while ($doc =~ / n\s*(\d+) ?(\(\d+\-\d+\))/g);
            if ($numeros_loi) {
                chop $numeros_loi;         # trailing comma
                $numeros_loi =~ tr/0-9,\-//cd;   # keep digits, '-' and ',' only
            }
        }

        # Amendment reference: first " n <id>" after the word "amendement".
        if ($intervention =~ /amendement( n[^<]+)/) {
            $doc = $1;
            $doc =~ s/&[^;]+;//g;
            $amendements = $1 if ($doc =~ / n\s*([COM\-\d]+)/);
        }

        $timestamp += 20;
        # NOTE(review): the pattern renders as an escaped blank; presumably it
        # targets a non-breaking space -- confirm against the file's bytes.
        $intervenant =~ s/\ / /g;

        # Refuse to emit a line without a usable date or commission.
        unless ($date =~ /\d{4}\-\d{2}-\d{2}/) {
            print STDERR "ERROR pas de date pour $file\n";
            exit 1;
        }
        if (!$commission || $commission =~ /[\/<>]/) {
            print STDERR "ERROR pas de commission pour $file\n";
            exit 1;
        }

        # Assemble the whole line first, then print it in one go.
        my $json = '{"commission": "' . quotize($commission) . '"';
        $json .= ', "contexte": "' . $context . '"';
        $json .= ', "intervention": "' . quotize($intervention) . '"';
        $json .= ', "timestamp": "' . $timestamp . '"';
        $json .= ', "date": "' . $date . '"';
        $json .= ', "source": "' . $url_source . $source . '"';
        $json .= ', "heure":"' . $heure . '"';
        $json .= ', "intervenant": "' . name_lowerize($intervenant) . '"';
        $json .= ', "fonction": "' . $fonction . '"';
        $json .= ', "intervenant_url": "' . $url_intervenant . '"';
        $json .= ', "session":"' . $session . '"';
        $json .= ', "numeros_loi":"' . $numeros_loi . '"' if ($numeros_loi);
        $json .= ', "amendements":"' . $amendements . '"' if ($amendements);
        $json .= "}\n";
        print $json;
    }

    # Start the next intervention from a clean slate.
    $intervenant = $fonction = $url_intervenant = $intervention = $amendements = '';
}
74 | |
---|
# setfonction($text)
#
# Looks for "audition de M..." in $text and, for every "<speaker>, <function>"
# pair that follows (speaker starting with M, M., Mme, ...; neither part
# containing a comma or a full stop), records the function in the package
# hash %fonctions under the speaker's name.
#
# Text without "audition de M" leaves %fonctions untouched.
# Returns nothing useful.
sub setfonction {
    my $f = shift;
    if ($f =~ /audition de (M[^<]+)/) {
        # Kept in a lexical: the package variable $a is reserved for sort
        # comparators and must not be overwritten here.
        my $auditionnes = $1;
        while ($auditionnes =~ /(M[me\.]* [^\,\.]+), ([^\,\.]+)/g) {
            $fonctions{$1} = $2;
        }
    }
    return;
}
84 | |
---|
# Set to 1 by the main loop once a chunk containing name="toc1" has been
# seen; chunks before that are only scanned for title and date.
$begin = 0;

# Alternation of the tokens a speaker designation may start with
# (M, M., Mme, Amiral, Général, S.E, "Son ", colonel). Interpolated into the
# speaker-detection patterns below and in the main loop.
$recointer = '(M\.?m?e?|Amiral|Général|S\.E|Son |colonel)';

# True when the page marks at least one speaker with an <a> or <strong>
# element; the main loop then only accepts speakers marked that way.
if ($content =~ /<(a|strong)[^>]*>\s*($recointer[^<]*)<\/(a|strong)>/i) {
    $interstrong = 1;
}
89 | |
---|
# ---------------------------------------------------------------------------
# Main scan: $content was cut so that each chunk ends with a closing
# p/h1-h4/ul/div tag. Every chunk may update the commission, the date, the
# context (h3 titles), the current anchor, the sitting label and the current
# speaker; paragraph text is appended to $intervention, which print_inter()
# flushes whenever one of those changes.
# ---------------------------------------------------------------------------
foreach (split /\n/, $content) {
    $begin = 1 if (/name="toc1"/);
    #print STDERR "title: $1\n" if (/<title>([^<]*)</);

    # Commission name from the page title; other body types are only
    # considered when the title does not mention "Commission".
    if (/TITLE>[^<]*(Commission[^\&:<]*)/i) {
        $commission = $1;
    } else {
        $commission = $1 if (/TITLE>[^<]*((Mission|Office|Délégation|Groupe de travail)[^\&:<]*)/i);
    }
    #	print ; print "\n";

    # Date heading: an h1-h3 whose text ends with a 4-digit number, in a chunk
    # that contains no "dddd-dddd" range. datize() and sessionize() are not
    # defined in this file (presumably in ../common/common.pm).
    if ((!/\d{4}\-\d{4}/) && (/<(h[123])[^>]*>(\s*<[^>]*>)*([^<\(]+\d{4})\W*<\/(h[123])>/i)) {
        #print STDERR "date: $3 $url_year\n";
        @date = datize($3, $url_year);
        if (@date) {
            #print STDERR "date:"."@date"." ($timestamp $intervention)\n";
            # Pending text with no line printed yet: flush it under the
            # previous date if there is one, otherwise under the new one.
            print_inter() if ($intervention && !$timestamp && $date);
            $date = join '-', @date;
            print_inter() if ($intervention && !$timestamp);
            #print STDERR "date:".$date."\n";
            $heure = '';
            $session = sessionize(@date);
            $numeros_loi = '';
            $nb_seance = 1;
        }
    }

    # Everything below only applies once the toc1 anchor has been reached.
    next if (!$begin);

    # h3 title: closes the running intervention and opens a new context.
    if (/<h3>(\s*<[^>]*>)*([^<]+)<\/h3>/) {
        $titre = $2;                 # saved: print_inter() runs regexes
        print_inter();
        $context = $titre;
        # Reset the speaker functions BEFORE parsing the title, so that the
        # functions announced in the title itself ("audition de M. X, ...")
        # are kept for this context instead of being wiped right away.
        %fonctions = ();
        setfonction($titre);
        # NOTE(review): the pattern renders as a plain blank; presumably it
        # targets a non-breaking space -- confirm against the file's bytes.
        $context =~ s/ / /g;
        $context =~ s/ - / > /;
        $intervention = '<p>'.$titre.'</p>';
        $numeros_loi = '';
        $is_newcontext = 1;
    }

    # Last anchor seen; appended to the source URL of each output line.
    $source = "#$1" if (/name="([^"]+)"/);

    if (/<p[^>]*>(.*)<\/p>/i) {
        $inter = $1;

        # "Au cours ..." underlined sentence: start of a sitting. The sitting
        # number is bumped for afternoon / evening mentions.
        if ($inter =~ /<u>(Au cours[^<]*)<\/u>/) {
            $aucours = $1;
            # NOTE(review): the first two alternatives render identically;
            # one is presumably a non-breaking space -- confirm.
            if ($aucours =~ /\Wapr[^s]+s( | |-)*midi($|\W)/) {
                $nb_seance = 2;
            } elsif ($aucours =~ /\Wsoir(é|&[^;]*;)e($|\W)/) {
                $nb_seance = 3;
            }
            print_inter() if (!$is_newcontext);
            $heure = ($nb_seance == 1) ? '1ere' : $nb_seance.'ieme';
            $heure .= ' séance';
            $timestamp = '0';
        }

        # First paragraph after an h3: emit the title as its own line.
        if ($is_newcontext) {
            $is_newcontext = 0;
            print_inter();
        }

        $inter =~ s/<a[^>]*><\/a>//ig;     # empty anchors

        # Paragraph entirely wrapped in u/strong/em: treated as a sub-title,
        # emitted on its own and scanned for speaker functions.
        if ($inter =~ /^<(u|strong|em)>(.*)<\/(u|strong|em)>$/i) {
            $inter = $2;
            print_inter();
            $inter =~ s/<[^>]+>//g;
            setfonction($inter);
            $intervention = '<p>'.$inter.'</p>';
            next;
        }

        # Merge adjacent strong/a runs separated only by blanks or commas.
        $inter =~ s/(<\/(strong|a)[^>]*>)+([\s,]*)(<\/?(strong)[^>]*>)+/$3/ig;

        # Speaker detection: inside <a>/<strong> when the page uses such
        # markup for speakers, otherwise right after any tag.
        if (($interstrong && $inter =~ /<(a|strong)[^>]*>($recointer[^<]+)<\/(a|strong)>/i) ||
            (!$interstrong && ($inter =~ /(>)\s*($recointer[^<]{10}[^<\.]*)/))) {
            $tmpintervenant = $2;
            $tmpintervenant =~ s/<[^>]*>//g;
            if ($tmpintervenant =~ s/^([^,]+), ([^,]*).*/$1/g) {
                # "Name, function ...": remember the function for later turns.
                $tmpfonction = $2;
                $tmpfonction =~ s/\W+$//;
                $fonctions{$tmpintervenant} = $tmpfonction;
            } else {
                if ($tmpintervenant =~ s/\s*l[ae].{1,6}(pr(&[^;]*;|é)sidente?)\s*/ /) {
                    # "M. le président" style: the function is in the name.
                    $tmpfonction = $1;
                } else {
                    $tmpfonction = $fonctions{$tmpintervenant};
                }
            }
            print_inter() if ($tmpintervenant ne $intervenant);
            $intervenant = $tmpintervenant;
            $fonction = $tmpfonction;
            $url_intervenant = $1 if ($inter =~ /href="([^"]+senfic\/[^"]+)"/i);
        }

        $inter =~ s/<[^>]+>//g;

        # Strip the leading "speaker[, function]" from the text. Both come
        # from the scraped page, so every regex metacharacter is escaped, not
        # just parentheses and '*'.
        $sintervenant = quotemeta($intervenant);
        $sfonction = quotemeta($fonction);
        $inter =~ s/^[^\w\&]*$sintervenant[^\w\&]*($sfonction[^\w\&]*|)//;

        # Keep only paragraphs that still contain at least one letter.
        $intervention .= '<p>'.$inter.'</p>' if ($inter =~ /[a-z]/i);
    }
    #	print "$date $titre $source\n";
}
# Flush whatever is still pending after the last chunk.
print_inter();