1 | <?php |
---|
2 | |
---|
/**
 * CLI task "tag:Seance": derives keyword tags for every Seance whose `tagged`
 * column is NULL and attaches them to the matching Intervention records.
 *
 * Outline of execute():
 *  1. build a set of words to ignore (hard-coded list, most frequent words of
 *     all interventions, words found in Parlementaire names);
 *  2. for each untagged Seance, keep the most frequent non-ignored words;
 *  3. look for recurring phrases around those words and promote the popular ones;
 *  4. add every tag found in an intervention's text to that intervention.
 */
class tagSeanceTask extends sfBaseTask
{
    /**
     * Spelling recorded for each word key seen by count().
     * Declared explicitly so that count() does not create a dynamic property.
     *
     * @var array
     */
    protected $sound = array();

    /**
     * Declares the task name, namespace, description and its two options.
     */
    protected function configure()
    {
        $this->namespace = 'tag';
        $this->name = 'Seance';
        $this->briefDescription = 'Tag Seance';
        $this->addOption('env', null, sfCommandOption::PARAMETER_OPTIONAL, 'Changes the environment this task is run in', 'test');
        $this->addOption('app', null, sfCommandOption::PARAMETER_OPTIONAL, 'Changes the application this task is run in', 'frontend');
    }

    /**
     * Counts word occurrences over the 'intervention' field of each row.
     *
     * Markup, parenthesised asides and punctuation are removed first. Words that
     * are not entirely made of A-Z are lower-cased; only words longer than two
     * bytes that contain an ASCII letter are counted. The count of a word ending
     * in "s" is then added to its form without the trailing "s" when that form
     * was also seen.
     *
     * @param array $array    rows, each holding an 'intervention' string
     * @param int   $excludeS when truthy, the "s" form is dropped after being merged
     *
     * @return array word => occurrences, sorted by descending occurrences
     *               (an empty array when no word was found)
     */
    protected function count($array, $excludeS = 0)
    {
        // Must exist even when $array is empty or yields no countable word.
        $words = array();
        $punctuation = array(',', ';', '.', ':', '_', '(', ')', '&', '#', '<', '>', '\'', '"', '«', '»', ' -', '- ', '?', '!');

        foreach ($array as $row) {
            $text = strip_tags($row['intervention']);
            $text = preg_replace('/<[^>]+>/', '', $text);
            $text = $this->replaceNbsp($text);
            // Drop parenthesised asides.
            $text = preg_replace('/\([^\)]+\)/', '', $text);
            $text = preg_replace('/œ/', 'oe', $text);
            $text = str_replace($punctuation, ' ', $text);
            $text = preg_replace('/\s+/', ' ', $text);

            foreach (explode(' ', $text) as $w) {
                // Fully upper-case ASCII words keep their case.
                if (!preg_match('/^[A-Z]+$/', $w)) {
                    $w = strtolower($w);
                }
                if (strlen($w) > 2 && preg_match('/[a-z]/i', $w)) {
                    if (isset($words[$w])) {
                        $words[$w]++;
                    } else {
                        $words[$w] = 1;
                    }
                    if (!isset($this->sound[$w])) {
                        $this->sound[$w] = $w;
                    }
                }
            }
        }

        // Fold "<word>s" into "<word>" when both forms were seen.
        foreach (array_keys($words) as $k) {
            if (!preg_match('/s$/', $k)) {
                continue;
            }
            $singular = preg_replace('/s$/', '', $k);
            if (isset($words[$singular])) {
                $words[$singular] += $words[$k];
                if ($excludeS) {
                    unset($words[$k]);
                }
            }
        }

        arsort($words);

        return $words;
    }

    /**
     * Replaces non-breaking spaces by plain spaces.
     *
     * The text is entity-encoded so that a non-breaking space shows up as the
     * "&nbsp;" entity, that entity is swapped for a space, and the text is
     * decoded back.
     *
     * @param string $text UTF-8 text
     *
     * @return string
     */
    protected function replaceNbsp($text)
    {
        $encoded = htmlentities($text, ENT_COMPAT, 'UTF-8');

        return html_entity_decode(str_replace('&nbsp;', ' ', $encoded), ENT_COMPAT, 'UTF-8');
    }

    /**
     * Builds the set of words that must never become a tag.
     *
     * Combines a hard-coded list, the most frequent words over all interventions
     * made by a parlementaire (unless white-listed) and every word appearing in
     * Parlementaire names. Prints the global word statistics while doing so.
     *
     * @return array word => 1
     */
    protected function buildExcludedWords()
    {
        $exclude = array(
            'vice-président' => 1, 'restant' => 1, 'mixte' => 1, 'paritaire' => 1,
            'rapporteure' => 1, 'rapporteur' => 1, 'députée' => 1, 'député' => 1,
            'sénateur' => 1, 'sénatrice' => 1, 'présidente' => 1, 'président' => 1,
            'rédaction' => 1, 'issue' => 1, 'spéciale' => 1, 'adopté' => 1,
            'lecture' => 1, 'séance' => 1, 'alinéa' => 1, 'résolution' => 1,
            'adoption' => 1, 'collègue' => 1, 'cher' => 1, 'collègues' => 1,
            'chers' => 1, 'bis' => 1, '1er' => 1, 'rectifié' => 1,
            'question' => 1, 'rédactionnel' => 1, 'scrutin' => 1, 'exposer' => 1,
            'identiques' => 1, 'identique' => 1, 'commission' => 1, 'adopte' => 1,
            'rejette' => 1, 'additionnel' => 1, 'tendant' => 1, 'examiné' => 1,
            'examine' => 1, 'rejeté' => 1, 'avis' => 1, 'suivant' => 1,
            'estimé' => 1, 'déclaré' => 1,
        );
        // Frequent words that stay eligible as tags anyway.
        $include = array(
            'télévision' => 1, 'dimanche' => 1, 'internet' => 1, 'outre-mer' => 1, 'logement' => 1,
            'militaire' => 1, 'taxe' => 1, 'médecin' => 1, 'hôpital' => 1,
        );

        $q = Doctrine_Query::create();
        $q->select('intervention')->from('Intervention i')->where('i.parlementaire_id IS NOT NULL');
        echo "count:\n\t";
        echo $q->count()."\n";

        $words = $this->count($q->fetchArray());
        // Percentages are relative to the number of distinct words.
        $tot = count($words);
        foreach ($words as $k => $occurrences) {
            if (!isset($include[$k])) {
                $exclude[$k] = 1;
            }
            $pc = $occurrences * 100 / $tot;
            echo $k.': '.$pc."\n";
            // Words are sorted by frequency: stop after the first one under 3%.
            if ($pc < 3) {
                break;
            }
        }

        $q = Doctrine_Query::create();
        $q->select('nom as intervention')->from('Parlementaire o');
        foreach (array_keys($this->count($q->fetchArray())) as $k) {
            $exclude[$k] = 1;
        }

        return $exclude;
    }

    /**
     * Collects the phrases that surround each tag inside the interventions.
     *
     * For every intervention and tag, two patterns are tried: the tag preceded
     * by at least 7 non-delimiter characters, and the tag followed by at least 7.
     * Only the first match of each pattern per intervention is counted.
     *
     * @param array $interventions rows holding an 'intervention' string
     * @param array $tags          tag => anything (only keys are used)
     *
     * @return array 'sentences' => (phrase => occurrences),
     *               'sent2word' => (phrase => last tag that produced it)
     */
    protected function findPhrases($interventions, $tags)
    {
        $sentences = array();
        $sent2word = array();

        foreach ($interventions as $inter) {
            $text = $this->replaceNbsp($inter['intervention']);

            foreach (array_keys($tags) as $tag) {
                // Quote the tag so that regex metacharacters in a word are matched literally.
                $srctag = preg_quote($tag, '/');
                $patterns = array(
                    '/([^\s,\.:>;\(\)«»]*[^,\.:>;\(\)«»]{7}'.$srctag.'[^\s,\.:<\(\)«»]*)/i',
                    '/([^\s,\.:>;\(\)«»]*'.$srctag.'[^,\.:<\(\)«»]{7}[^\s,\.:<\(\)«»]*)/i',
                );
                foreach ($patterns as $pattern) {
                    if (!preg_match($pattern, $text, $match)) {
                        continue;
                    }
                    $sent = trim(strtolower($match[1]));
                    if (isset($sentences[$sent])) {
                        $sentences[$sent]++;
                    } else {
                        $sentences[$sent] = 1;
                    }
                    $sent2word[$sent] = $tag;
                }
            }
        }

        return array('sentences' => $sentences, 'sent2word' => $sent2word);
    }

    /**
     * Tells whether a phrase contains one of the excluded phrases.
     *
     * @param string $sent              candidate phrase
     * @param array  $exclude_sentences excluded phrase => 1
     *
     * @return bool
     */
    protected function isExcludedSentence($sent, $exclude_sentences)
    {
        // The phrases are the KEYS of the map; its values are all 1.
        foreach (array_keys($exclude_sentences) as $excl_sent) {
            if (preg_match('/'.preg_quote($excl_sent, '/').'/i', $sent)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tags the interventions of every Seance that has not been tagged yet.
     *
     * A Seance is flagged as tagged only when at least one of its interventions
     * received a tag. Progress and the selected tags are printed to stdout.
     *
     * @param array $arguments task arguments (unused)
     * @param array $options   task options (unused here)
     */
    protected function execute($arguments = array(), $options = array())
    {
        // Kept for its side effects; presumably sets up the Doctrine connections used below.
        $manager = new sfDatabaseManager($this->configuration);

        $exclude = $this->buildExcludedWords();
        $exclude_sentences = array('commission spéciale' => 1, 'garde des sceaux' => 1, 'haut-commissaire' => 1, 'monsieur' => 1, 'madame' => 1, 'mixte paritaire' => 1, 'commission mixte' => 1, 'commission mixte paritaire' => 1);
        $debut_bani = 'à|de|la|ainsi|ensuite';

        $qs = Doctrine::getTable('Seance')->createQuery()->select('id')->where('tagged IS NULL')->orderBy('date DESC');

        foreach ($qs->fetchArray() as $s) {
            echo "Seance ".$s['id']." ..";

            // Fetch the interventions of this seance, skipping those whose fonction starts with "président".
            $q = Doctrine_Query::create();
            $q->select('intervention, id, parlementaire_id')->from('Intervention i')->where('seance_id = ?', $s['id'])->andWhere('( i.parlementaire_id IS NOT NULL OR i.personnalite_id IS NOT NULL )')->andWhere('(i.fonction IS NULL OR i.fonction NOT LIKE ? )', 'président%');

            $array = $q->fetchArray();
            if (!count($array)) {
                echo " pas d'intervention trouvée\n";
                continue;
            }

            $words = $this->count($array, 1);
            $tot = count($words);

            // Keep the most popular words that are not excluded.
            $tags = array();
            $tagCounts = array();
            foreach ($words as $rawWord => $occurrences) {
                $word = trim($rawWord);
                if (isset($exclude[$word])) {
                    continue;
                }
                // Words are sorted by frequency: stop at the first eligible one under 0.8%.
                if ($occurrences * 100 / $tot < 0.8) {
                    break;
                }
                $tags[$word] = strlen($word);
                $tagCounts[$word] = $occurrences;
            }
            unset($words);

            // Look for recurring groups of words around the tags found.
            $phrases = $this->findPhrases($array, $tags);
            $sentences = $phrases['sentences'];
            $sent2word = $phrases['sent2word'];
            unset($phrases);

            // Promote phrases that are popular enough; when a phrase accounts for
            // more than 70% of its word's occurrences, the bare word is dropped.
            foreach ($sentences as $sent => $sentCount) {
                $sent = (string) $sent;
                $word = $sent2word[$sent];

                // NOTE(review): "commision" looks like a misspelling of "commission";
                // left untouched because correcting it would change which phrases are kept.
                if (preg_match("/^($debut_bani)/i", $sent) || preg_match("/($debut_bani)$/i", $sent) || preg_match('/\d|amendement|rapporteur|commision|collègue/i', $sent)) {
                    continue;
                }

                // NOTE(review): phrases are lower-cased by findPhrases(), so this
                // upper-case test (and the [A-Z]+ alternative below) presumably never matches.
                if (preg_match('/^[A-Z][a-z]/', $sent)) {
                    unset($tags[$word]);
                    continue;
                }

                if (preg_match('/^([a-z]{2} |[A-Z]+)/', $sent) || preg_match('/ [a-z]$/i', $sent)) {
                    continue;
                }

                $wordCount = $tagCounts[$word];
                $shareOfWord = $sentCount * 100 / $wordCount;
                if (($sentCount * 100 / $tot > 0.8 || $shareOfWord > 70) && $wordCount > 5) {
                    if (!$this->isExcludedSentence($sent, $exclude_sentences)) {
                        $tags[$sent] = strlen($sent);
                    }
                    if ($shareOfWord > 70) {
                        unset($tags[$word]);
                    }
                }
            }
            unset($sentences);
            unset($sent2word);
            unset($tagCounts);

            foreach ($tags as $t => $n) {
                print htmlentities($t, ENT_COMPAT, 'UTF-8')."\n";
            }
            print_r($tags);

            // Search the tags in the interventions to associate them; longest tags first.
            arsort($tags);
            $tagged = 0;
            foreach ($array as $inter) {
                if (!$inter['parlementaire_id']) {
                    continue;
                }

                // The record is loaded lazily, only once a first tag matches.
                $i = null;
                foreach (array_keys($tags) as $tag) {
                    $tag = trim($tag);
                    if (preg_match('/'.preg_quote($tag, '/').'/i', $inter['intervention'])) {
                        if (!$i) {
                            $i = Doctrine::getTable('Intervention')->find($inter['id']);
                        }
                        $i->addTag($tag);
                    }
                }
                if ($i) {
                    $tagged = 1;
                    $i->save();
                }
            }

            if ($tagged == 1) {
                $seance = Doctrine::getTable('Seance')->find($s['id']);
                $seance->tagged = 1;
                $seance->save();
            }
            unset($tags);
            unset($array);

            echo " done.";
            if ($tagged == 0) {
                echo " WARNING: No tag found !";
            }
            echo "\n";
        }
    }
}
---|