[vhffs-dev] [1313] improved parsing and normalization
[ Thread Index | Date Index | More vhffs.org/vhffs-dev Archives ]
Revision: 1313
Author: gradator
Date: 2008-12-04 23:46:43 +0100 (Thu, 04 Dec 2008)
Log Message:
-----------
improved parsing and normalization
Modified Paths:
--------------
trunk/vhffs-robots/src/repository_stats.pl
Modified: trunk/vhffs-robots/src/repository_stats.pl
===================================================================
--- trunk/vhffs-robots/src/repository_stats.pl 2008-12-04 12:32:54 UTC (rev 1312)
+++ trunk/vhffs-robots/src/repository_stats.pl 2008-12-04 22:46:43 UTC (rev 1313)
@@ -138,47 +138,31 @@
open( MERGEDIN , "< ".$log_incoming_root."/mergedlog" );
open( REJECTOUT, "> ".$log_incoming_root."/rejectlog" );
-while( my $line = <MERGEDIN> ) {
+while( <MERGEDIN> ) {
- $line =~ s/\n//;
+ my ( $remotehost , $rfc931 , $authuser , $date , $request , $status , $size , $referer , $useragent ) = ( $_ =~ /^([^\s]*)\s+([^\s]*)\s+([^\s]*)\s+\[([^\]]*)\]\s+\"([^\"]*)\"\s+([^\s]*)\s+([^\s]*)(?:\s+\"([^\"]*)\")?(?:\s+\"([^\"]*)\")?$/ );
- # is it a ftp or http log ?
- my ( undef , undef , $type , undef ) = split( /\ /, $line, 4);
+ next unless ( defined $remotehost && defined $rfc931 && defined $authuser && defined $date && defined $request && defined $status && defined $size );
- # modify loglines to produce an uniformised output for ftp and http
- if( $type eq 'ftp' ) {
+ # define referer and useragent (convert common to combined log)
+ $referer = '-' unless defined $referer;
+ $useragent = '-' unless defined $useragent;
- # remove the "/data/repository/" part of the query
- $line =~ s%$ddir%%g;
- $line =~ s%$%\ \"\-\"\ \"\-\"%;
+ # remove the "/data/repository/" part of the query
+ $request =~ s%$ddir/*%/%;
- # add HTTP/1.1 at the end of the query
- my ( $part1 , $part2 ) = split( /\]/, $line, 2);
- my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
+ # remove the http:// part of the query if it exists
+ $request =~ s%http://[^/]+/*%/%;
- $line = $part1.']'.$part21.'"'.$part22.' HTTP/1.1"'.$part23;
+ # add HTTP/1.0 at the end of the query if needed
+ $request .= ' HTTP/1.0' if( $request && $request !~ /\ HTTP\/1.[01]$/ );
- } else {
+ # fetch the group
+ my ( $area ) = ( $request =~ /^[^\/]*\/([^\/]+)/ );
- # remove the http:// part of the query if it exists
- my ( $part1 , $part2 ) = split( /\]/, $line, 2);
- my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
+ # rebuild
+ my $log = $remotehost.' '.$rfc931.' '.$authuser.' ['.$date.'] "'.$request.'" '.$status.' '.$size.' "'.$referer.'" "'.$useragent.'"';
- if( $part22 =~ /^\w+\ http:\/\/.+$/ ) {
- my ( $part221 , undef ) = split( /\ / , $part22 , 2);
- my ( undef, undef, undef, $part222) = split( /\// , $part22 , 4);
-
- $line = $part1.']'.$part21.'"'.$part221.' /'.$part222.'"'.$part23;
- }
- }
-
- # now fetch the download area and modify the logline to discard download area information
- my ( $part1 , $part2 ) = split( /\]/, $line, 2);
- my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
-
- my ( $part221 , $area , $part223 ) = split( /\// , $part22 , 3);
- my $log = $part1.']'.$part21.'"'.$part221.'/'.$part223.'"'.$part23;
-
# append log line to the concerned download area
if ( defined $area && defined $log ) {