Changeset 1362
- Timestamp: Sep 12, 2010, 2:39:47 AM (10 years ago)
- Location: cpc/trunk/project/batch/documents
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
cpc/trunk/project/batch/documents/compute_latest.sh
r1333  r1362
    1      1    #!/bin/bash
           2  +
           3  + if ! test -e out/ ; then
           4  + mkdir out
           5  + fi
    2      6
    3      7    for file in `perl download_docs.pl`; do
    4      8    echo $file
    5         - file2=`echo $file | sed 's/ \(pjl|ppl|ppr|rap|ta\)/out/'`
    6         - perl parse_ document.pl $file > $file2
           9  + file2=`echo $file | sed 's/^\(pjl\|ppl\|ppr\|rap\|ta\)\//out\//'`
          10  + perl parse_metas.pl $file > $file2
    7     11    done;
    8     12
cpc/trunk/project/batch/documents/parse_metas.pl
r1361 r1362 36 36 $string =~ s/\s+/ /g; 37 37 $string =~ s/"//g; 38 if ($dir == "rap") { 39 $string =~ s/L\W\W([AÀEÈÉÊIOUYH])/L'\1/g; 40 $string =~ s/D\W\W([AÀEÈÉÊIOUYH])/D'\1/g; 41 } 38 42 39 $keywords = ""; 43 40 #print $string."\n"; … … 159 156 $categorie =~ s/Ç/ç/g; 160 157 158 $string =~ s/<[^>]*>//gi; 159 $string =~ s/"//gi; 160 $string =~ s/[\n\t]/ /gi; 161 $string =~ s/ +/ /gi; 161 162 163 $string =~ s/if \(window!= top\) top\.location\.href=location\.href//i; 164 $string =~ s/Recherche \| Aide \| Plan du site Accueil \>\; Documents parlementaires \>\; Les rapports législatifs//i; 165 $string =~ s/_____ ASSEMBL'E NATIONALE CONSTITUTION DU 4 OCTOBRE 1958 TREIZIÈME LÉGISLATURE//i; 166 $string =~ s/__*//i; 162 167 #print "\n"; 163 print '{"source": "'.$source.'", "legislature": "'.$legislature.'", "id": "'.$id.'", "numero": "'.$num.'", "annexe": "'.$annexe.'", "date_depot": "'.$date0.'", "date_publi": "'.$date1.'", "auteurs": "'.$auteurs.'", "dossier": "'.$dossier.'", "type": "'.$type0.'", "type_details": "'.$type1.'", "titre": "'.$titre.'", "categorie": "'.$categorie.'", "motscles": "'.$keywords.'"}'."\n"; 164 165 168 print '{"source": "'.$source.'", "legislature": "'.$legislature.'", "id": "'.$id.'", "numero": "'.$num.'", "annexe": "'.$annexe.'", "date_depot": "'.$date0.'", "date_publi": "'.$date1.'", "auteurs": "'.$auteurs.'", "dossier": "'.$dossier.'", "type": "'.$type0.'", "type_details": "'.$type1.'", "titre": "'.$titre.'", "categorie": "'.$categorie.'", "motscles": "'.$keywords.'", "contenu": "'.$string.'"}'."\n";
Note: See TracChangeset for help on using the changeset viewer.