Changeset 1362


Ignore:
Timestamp:
Sep 12, 2010, 2:39:47 AM (10 years ago)
Author:
teymour
Message:

Ajout du contenu des documents
+ correction de compute_latest.sh

Location:
cpc/trunk/project/batch/documents
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • cpc/trunk/project/batch/documents/compute_latest.sh

    r1333 r1362  
     r1333  r1362
         1      1  #!/bin/bash
                2
                3  if ! test -e out/ ; then
                4          mkdir out
                5  fi
         2      6
         3      7  for file in `perl download_docs.pl`; do
         4      8    echo $file
         5           file2=`echo $file | sed 's/\(pjl|ppl|ppr|rap|ta\)/out/'`
         6           perl parse_document.pl $file > $file2
                9    file2=`echo $file | sed 's/^\(pjl\|ppl\|ppr\|rap\|ta\)\//out\//'`
               10    perl parse_metas.pl $file > $file2
         7     11  done;
         8     12
  • cpc/trunk/project/batch/documents/parse_metas.pl

    r1361 r1362  
     r1361  r1362
        36     36  $string =~ s/\s+/ /g;
        37     37  $string =~ s/"//g;
        38         if ($dir == "rap") {
        39           $string =~ s/L\W\W([AÀEÈÉÊIOUYH])/L'\1/g;
        40           $string =~ s/D\W\W([AÀEÈÉÊIOUYH])/D'\1/g;
        41         }
               38
        42     39  $keywords = "";
        43     40  #print $string."\n";
     
     r1361  r1362
       159    156  $categorie =~ s/Ç/ç/g;
       160    157
              158  $string =~ s/<[^>]*>//gi;
              159  $string =~ s/"//gi;
              160  $string =~ s/[\n\t]/ /gi;
              161  $string =~ s/  +/ /gi;
       161    162
              163  $string =~ s/if \(window!= top\) top\.location\.href=location\.href//i;
              164  $string =~ s/Recherche \| Aide \| Plan du site Accueil \&gt\; Documents parlementaires \&gt\; Les rapports législatifs//i;
              165  $string =~ s/_____ ASSEMBL'E NATIONALE CONSTITUTION DU 4 OCTOBRE 1958 TREIZIÈME LÉGISLATURE//i;
              166  $string =~ s/__*//i;
       162    167  #print "\n";
       163         print '{"source": "'.$source.'", "legislature": "'.$legislature.'", "id": "'.$id.'", "numero": "'.$num.'", "annexe": "'.$annexe.'", "date_depot": "'.$date0.'", "date_publi": "'.$date1.'", "auteurs": "'.$auteurs.'", "dossier": "'.$dossier.'", "type": "'.$type0.'", "type_details": "'.$type1.'", "titre": "'.$titre.'", "categorie": "'.$categorie.'", "motscles": "'.$keywords.'"}'."\n";
       164
       165
              168  print '{"source": "'.$source.'", "legislature": "'.$legislature.'", "id": "'.$id.'", "numero": "'.$num.'", "annexe": "'.$annexe.'", "date_depot": "'.$date0.'", "date_publi": "'.$date1.'", "auteurs": "'.$auteurs.'", "dossier": "'.$dossier.'", "type": "'.$type0.'", "type_details": "'.$type1.'", "titre": "'.$titre.'", "categorie": "'.$categorie.'", "motscles": "'.$keywords.'", "contenu": "'.$string.'"}'."\n";
Note: See TracChangeset for help on using the changeset viewer.