Changeset 2526


Ignore:
Timestamp:
Dec 2, 2011, 6:03:43 PM (9 years ago)
Author:
teymour
Message:

Optimisation du téléchargement des amendements

File:
1 edited

Legend:

Unmodified
Added
Removed
  • cpc/branches/senat/project/batch/download_elements_dossiers.pl

    r2198 r2526  
      7   7   use File::Path qw(make_path);
      8   8
          9 + #Annee des dossiers à télécharger
         10 + $year = shift;
         11 + $since_hour = shift || 24;
         12 + $verbose = shift || 0;
         13 +
      9  14   $lastyear = localtime(time);
     10  15   my @month = `date +%m`;
     
     12  17   $lastyear-- if ($month[0] < 10);
     13  18
     14     - $year = shift;
     15  19   $year = $lastyear if (!$year);
     16  20   $yearzero = $year;
     
     20  24   }
     21  25
     22     - $verbose = shift || 0;
     23     -
     24  26   %done = ();
     25  27   %donedo = ();
     26  28   %donedl = ();
     27  29   $a = WWW::Mechanize->new();
         30 + $aif = WWW::Mechanize->new();
         31 + $aif->add_header('If-Modified-Since' => scalar(localtime(time()-3600*$since_hour)));
     28  32
     29  33   sub download_one {
     
     33  37     return if ($donedl{$uri});
     34  38     $donedl{$uri} = 1;
     35     -   eval {$a->get($uri);};
     36     -   if ($a->status() == 404) {
     37     -     $a->back();
         39 +   eval {$aif->get($uri);};
         40 +   if ($aif->status() == 404) {
         41 +     $aif->back();
     38  42       if ($dir =~ /r(ga|ap)/i && $uri =~ /_mono/i) {
     39  43         $uri =~ s/_mono//i;
     40  44         download_one($uri, $dir);
     41  45       } else {
     42     -       print "ERREUR 404 sur $uri\n";
         46 +       print STDERR "ERREUR 404 sur $uri\n";
     43  47       }
     44  48       return;
     45  49     }
     46     -   $htmfile = uri_escape($a->uri);
         50 +   $htmfile = uri_escape($aif->uri);
     47  51     if ($donedl{$htmfile}) {
     48     -     $a->back();
         52 +     $aif->back();
     49  53       return;
     50  54     }
     51  55     $donedl{$htmfile} = 1;
         56 +
         57 +   $thecontent = $aif->content;
         58 +   if (!$thecontent) {
         59 +         $aif->back();
         60 +         return ;
         61 +   }
         62 +
     52  63     print "    $dir\t\t->\t\t$htmfile\n";
     53  64     open FILE, ">:utf8", "$dir/$htmfile";
     54     -   $thecontent = $a->content;
     55  65     if ($thecontent =~ s/iso-8859-1/utf-8/gi) {
     56  66       $thecontent = decode("windows-1252", $thecontent);
     
     58  68     print FILE $thecontent;
     59  69     close FILE;
     60     -   $a->back();
         70 +   $aif->back();
     61  71   }
     62  72
     
     97 107     my $urla = "";
     98 108     my $outdir = "";
     99     -   $a->get($urlp);
    100     -   if ($donedo{$a->uri}) {
    101     -     $a->back();
        109 +   $aif->get($urlp);
        110 +   if ($donedo{$aif->uri}) {
    102 111       return;
    103 112     }
    104     -   $donedo{$a->uri} = 1;
        113 +   $donedo{$aif->uri} = 1;
    105 114     print STDERR " examine dossier $urlp\n" if ($verbose);
    106     -   my $contentleg = $a->content;
        115 +   my $contentleg = $aif->content;
    107 116     if ($contentleg =~ s/iso-8859-1/utf-8/gi) {
    108 117       $contentleg = decode("windows-1252", $contentleg);
     
    116 125       if ($line =~ /(\/amendements.*\/(\d{4})-.+\/)accueil\.html/) {
    117 126         if ($2 lt $yearzero) {
    118     -         $a->back();
    119 127           next;
    120 128         }
     
    128 136      }
    129 137     }
    130     -   $a->back();
    131 138   }
    132     -
    133     - # urls en compte-rendu-commissions a checker si already dl?
    134 139
    135 140   sub explore_page {
Note: See TracChangeset for help on using the changeset viewer.