#!/usr/bin/perl
########################################################################
# Program Name  : htsearchterm.pl
# Description   : Processes a Common or Combined Logfile Format access_log
#                 file extracting and organizes Ht://Dig search terms into
#                 two formats - numeric and alphabetic - outputting in
#                 simple HTML format.
# Version       : 0.1
#
# Written by    : Max Pyziur - BRAMA, Inc, pyz@brama.com
#                 with credit due to Roy Fielding's wwwstat
#                 located at http://www.ics.uci.edu/pub/websoft/wwwstat
#                 and Stephane Bortzmeyer, (bortzmeyer@pasteur.fr)'s
#                 http://www.htdig.org/files/contrib/scripts/log.pl
# Last Modified : 02/24/02
########################################################################

use strict;
use warnings;

# mostly_rev_numeric() at the end of this file applies <=> to whole report
# lines such as "    5 term\n" and relies on Perl reading the leading count.
# That is intentional, so keep the "isn't numeric" warning quiet.
no warnings 'numeric';

use File::Basename qw(basename);
use POSIX qw(strftime);

# Script-wide state shared by the processing steps below.
my $LANGCODE = '';          # optional language code from the command line
my $ACCESSLOG;              # path of the access_log to process
my $WEBSITENAME;            # site name shown in the report
my $TODAY;                  # report timestamp
my %SEARCHTERMREQUESTS;     # normalized search term => number of requests
my @ALPHASTS;               # "count term\n" lines in alphabetical term order
my @REVSTS;                 # the same lines ordered by mostly_rev_numeric

# -=* Main *=-

validate();
assign_variables();         # was called as assignvariables(), which does not exist
getterms();
prepterms();
printdata();

# -=* End Main *=-

# Checks the command line and fills in $LANGCODE / $ACCESSLOG.
#
# Accepted forms:  LogFile  |  LangCode LogFile  |  h
# Prints a message plus the usage text and exits on any problem (exit
# status 1 for errors, 0 for the explicit "h" help request).
sub validate {
    if (@ARGV < 1 || @ARGV > 2) {
        print "\n\n\tMissing or too many arguments !\n";
        usage();
        exit 1;
    }

    if ($ARGV[0] eq 'h') {
        usage();
        exit;
    }

    # The original tested "$#ARGV = 1" (an assignment), which was always
    # true and forced the LangCode branch even for a single argument.
    if (@ARGV == 2) {
        my $langcode = lc $ARGV[0];

        # Exact comparison: a substring match would accept values that the
        # later "eq 'cp1251'" test silently ignores.
        if ($langcode ne 'cp1251') {
            print "\n\n\tLangCode can only be a choice of \"cp1251\"!\n";
            usage();
            exit 1;
        }
        $LANGCODE  = $langcode;
        $ACCESSLOG = $ARGV[1];
    }
    else {
        $ACCESSLOG = $ARGV[0];
    }

    # -f already implies existence; "_" reuses the stat from the -f test.
    unless (-f $ACCESSLOG && -r _) {
        print "\n\n\tLogFile not found or unreadable!\n";
        usage();
        exit 1;
    }
    return;
}

# Prints the English usage text to STDOUT.
sub usage {
    my $prog = basename($0);

    print <<"END_USAGE";

        $prog processes a Common or Combined Logfile Format access_log
        file extracting and organizes Ht://Dig search terms into two formats -
        numeric and alphabetic - outputting in simple HTML format.

        $prog is run in the following ways:

                $prog LogFile
        or
                $prog LangCode LogFile

        where "LangCode" can be a choice of the following:
                cp1251  for Cyrillic windows-1251 coding
                        (Ukrainian, Belorussian, Russian, Macedonian, Bulgarian, Serbian)

        Examples:
                $prog /path/to/access_log
                $prog cp1251 /path/to/access_log
                        (to work with Cyrillic cp1251 search terms)
                $prog h
                        produces this usage help

        see http://www.htdig.org/ for all of the details on Ht://Dig

END_USAGE
    return;
}

# Prints the Ukrainian usage text to STDOUT.  Not called by this script.
#
# NOTE(review): the Cyrillic (cp1251) wording of this message appears to have
# been lost to an encoding conversion; only the ASCII remnants are reproduced
# here.  Restore the text from a cp1251 copy of the script before using it.
sub ukrusage {
    my $prog = basename($0);

    print <<"END_USAGE";

        $prog Common or Combined Logfile Format access_log Ht://Dig - - .

        $prog is run in the following ways:

                $prog LogFile
        or
                $prog LangCode LogFile

        "LangCode" :
                cp1251  -1251
                        (, , , , , )

        :
                $prog ///access_log
                $prog cp1251 ///access_log
                        (to work with Cyrillic cp1251 search terms)
                $prog h
                        produces this usage help

        see http://www.htdig.org/ for all of the details on Ht://Dig

END_USAGE
    return;
}

# Sets the report constants: the site name and the run timestamp.
sub assign_variables {
    $WEBSITENAME = "BRAMA";

    # Same layout as the default output of date(1), without spawning a shell.
    $TODAY = strftime('%a %b %e %H:%M:%S %Z %Y', localtime);
    return;
}

# Reads $ACCESSLOG and counts every htsearch request's normalized search
# term in %SEARCHTERMREQUESTS.  Dies if the log cannot be opened.
sub getterms {
    # Read the log directly instead of piping it through egrep: the file
    # name no longer passes through a shell, and open failures are reported.
    open(my $log, '<', $ACCESSLOG)
        or die "Cannot open $ACCESSLOG: $!\n";

    while (my $line = <$log>) {
        next if index($line, 'GET /cgi-bin/htsearch') < 0;
        chomp $line;

        # host ident authuser [timestamp] "request" status bytes ...
        next unless $line =~ m{^\S+ \S+ \S+ \[[^\]]*\] "([^"]*)" \S+ \S+};

        my $term = _normalize_search_term($1);
        $SEARCHTERMREQUESTS{$term}++ if $term ne '';
    }
    close($log);
    return;
}

# Turns the request field of a log line into a normalized search term.
#
#   $request - the quoted request, e.g. "GET /cgi-bin/htsearch?words=a+b HTTP/1.0"
#
# Returns the lower-cased term with separators collapsed to single spaces,
# or '' when nothing is left.
sub _normalize_search_term {
    my ($request) = @_;
    my $term = $request;

    $term =~ s/GET.+words=//;       # remove leading stuff
    $term =~ s/;page.+HTTP.+//;     # remove trailing stuff
    $term =~ s/ HTTP.+//;           # remove more trailing stuff
    $term =~ tr/+/ /;               # change plus to space

    # Decode %20..%FF in one left-to-right pass.  This replaces the former
    # lookup table, whose last entry was skipped by an off-by-one loop and
    # whose hash-order iteration could decode "%25xx" twice.  %00..%1F were
    # not in that table and are still left untouched.
    $term =~ s/%([2-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/ge;

    # Query operators become spaces.  The old table mapped %22 (double
    # quote) to "*", which was then blanked, so '"' is blanked here as well.
    $term =~ tr/"*+,/ /;

    $term =~ s/\s+/ /g;             # change multiple spaces to one
    $term =~ s/^ //;                # remove leading space
    $term =~ s/ $//;                # remove trailing space so "foo " and "foo" count together
    $term =~ s/(?:^')|(?:'$)//g;    # get rid of leading and trailing single quote
    $term =~ tr/A-Z/a-z/;

    # Left over when the request carried no "words=" parameter.
    $term =~ s{get /cgi-bin/htsearch}{}g;

    if ($LANGCODE eq 'cp1251') {
        # Lower-case windows-1251 Cyrillic capitals.
        # NOTE(review): the original character lists were destroyed by an
        # encoding conversion; this is the standard cp1251 upper->lower
        # mapping - confirm it covers the same letters as the original.
        $term =~ tr/\x80\x81\x8A\x8C-\x8F\xA1\xA3\xA5\xA8\xAA\xAF\xB2\xBD\xC0-\xDF/\x90\x83\x9A\x9C-\x9F\xA2\xBC\xB4\xB8\xBA\xBF\xB3\xBE\xE0-\xFF/;
    }

    return $term;
}

# Builds the two report listings from %SEARCHTERMREQUESTS:
# @ALPHASTS (alphabetical by term) and @REVSTS (via mostly_rev_numeric).
sub prepterms {
    @ALPHASTS = map { sprintf("%5d %s\n", $SEARCHTERMREQUESTS{$_}, $_) }
                sort keys %SEARCHTERMREQUESTS;

    @REVSTS = sort mostly_rev_numeric @ALPHASTS;
    return;
}

# Writes the HTML report to STDOUT.
#
# NOTE(review): the markup in the original print statements had been
# stripped; the tags below are a reconstruction around the surviving text.
sub printdata {
    # Search terms come straight from the access log, so everything is
    # HTML-escaped before being written into the page.
    my $site  = _escape_html($WEBSITENAME);
    my $today = _escape_html($TODAY);

    # "${site}'s": a bare "$site's" is read as the package variable
    # $site::s by perls that accept ' as a package separator.
    print "<html>\n";
    print "<head><title>${site}'s Ht://Dig Search Engine Search Terms </title></head>\n";
    print "<body>\n";
    print "\n";
    print "<center><h2>\n${site}'s Ht://Dig Search Engine Search Terms\n</h2></center>\n";
    print "<b>Report Run $today</b>\n<hr>\n\n";
    print "<ul>\n";
    print "<li><a href=\"#popular\">Terms sorted by popularity</a>";
    print "<li><a href=\"#alpha\">Terms sorted alphabetically</a></ul>";
    print "<a name=\"popular\"><b>Most Popular Search Terms</b></a>\n";
    print "<pre>";

    foreach my $line (@REVSTS) {
        print "\t", _escape_html($line);
    }
    print "</pre>\n<hr>\n";

    print "<pre>";
    print "<a name=\"alpha\"><b>Alphabetized Search Terms</b></a>\n";

    foreach my $line (@ALPHASTS) {
        print "\t", _escape_html($line);
    }

    print "</pre></body></html>\n";
    return;
}

# Returns $text with the HTML-significant characters & < > " escaped.
sub _escape_html {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
    
    # Sort comparator (uses sort's $a / $b): larger numeric value first,
    # ties broken by ascending string comparison of the two elements.
    sub mostly_rev_numeric {
        my $by_number = $b <=> $a;
        return $by_number if $by_number;
        return $a cmp $b;
    }
    
    # end of script