[vhffs-dev] [1313] improved parsing and normalization

[ Thread Index | Date Index | More vhffs.org/vhffs-dev Archives ]


Revision: 1313
Author:   gradator
Date:     2008-12-04 23:46:43 +0100 (Thu, 04 Dec 2008)

Log Message:
-----------
improved parsing and normalization

Modified Paths:
--------------
    trunk/vhffs-robots/src/repository_stats.pl


Modified: trunk/vhffs-robots/src/repository_stats.pl
===================================================================
--- trunk/vhffs-robots/src/repository_stats.pl	2008-12-04 12:32:54 UTC (rev 1312)
+++ trunk/vhffs-robots/src/repository_stats.pl	2008-12-04 22:46:43 UTC (rev 1313)
@@ -138,47 +138,31 @@
 open( MERGEDIN , "< ".$log_incoming_root."/mergedlog" );
 open( REJECTOUT, "> ".$log_incoming_root."/rejectlog" );
 
-while( my $line = <MERGEDIN> )  {
+while( <MERGEDIN> )  {
 
-	$line =~ s/\n//;
+	my ( $remotehost , $rfc931 , $authuser , $date , $request , $status , $size , $referer , $useragent ) = ( $_ =~ /^([^\s]*)\s+([^\s]*)\s+([^\s]*)\s+\[([^\]]*)\]\s+\"([^\"]*)\"\s+([^\s]*)\s+([^\s]*)(?:\s+\"([^\"]*)\")?(?:\s+\"([^\"]*)\")?$/ );
 
-	# is it a ftp or http log ?
-	my ( undef , undef , $type , undef ) = split( /\ /, $line, 4);
+	next unless ( defined $remotehost && defined $rfc931 && defined $authuser && defined $date && defined $request && defined $status && defined $size );
 
-	# modify loglines to produce an uniformised output for ftp and http
-	if( $type eq 'ftp' )  {
+	# define referer and useragent (convert common to combined log)
+	$referer = '-' unless defined $referer;
+	$useragent = '-' unless defined $useragent;
 
-		# remove the "/data/repository/" part of the query
-		$line =~ s%$ddir%%g;
-		$line =~ s%$%\ \"\-\"\ \"\-\"%;
+	# remove the "/data/repository/" part of the query
+	$request =~ s%$ddir/*%/%;
 
-		# add HTTP/1.1 at the end of the query
-		my ( $part1 , $part2 ) = split( /\]/, $line, 2);
-		my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
+	# remove the http:// part of the query if it exists
+	$request =~ s%http://[^/]+/*%/%;
 
-		$line = $part1.']'.$part21.'"'.$part22.' HTTP/1.1"'.$part23;
+	# add HTTP/1.0 at the end of the query if needed
+	$request .= ' HTTP/1.0' if( $request && $request !~ /\ HTTP\/1.[01]$/ );
 
-	}  else  {
+	# fetch the group
+	my ( $area ) = ( $request =~ /^[^\/]*\/([^\/]+)/ ); 
 
-		# remove the http:// part of the query if it exists
-		my ( $part1 , $part2 ) = split( /\]/, $line, 2);
-		my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
+	# rebuild
+	my $log = $remotehost.' '.$rfc931.' '.$authuser.' ['.$date.'] "'.$request.'" '.$status.' '.$size.' "'.$referer.'" "'.$useragent.'"';
 
-		if( $part22 =~ /^\w+\ http:\/\/.+$/ ) {
-			my ( $part221 , undef ) = split( /\ / , $part22 , 2);
-			my ( undef, undef, undef, $part222) = split( /\// , $part22 , 4);
-
-			$line = $part1.']'.$part21.'"'.$part221.' /'.$part222.'"'.$part23;
-		}
-	}
-
-	# now fetch the download area and modify the logline to discard download area information
-	my ( $part1 , $part2 ) = split( /\]/, $line, 2);
-	my ( $part21 , $part22 , $part23 ) = split( /\"/ , $part2 , 3);
-
-	my ( $part221 , $area , $part223 ) = split( /\// , $part22 , 3);
-	my $log = $part1.']'.$part21.'"'.$part221.'/'.$part223.'"'.$part23;
-
 	# append log line to the concerned download area
 	if ( defined $area && defined $log )  {
 


Mail converted by MHonArc 2.6.19+ http://listengine.tuxfamily.org/