#!/usr/local/bin/perl
#
###############################################################################
#
# Script name:   sessions
# Script author: Michael Hayward
#
# Description:
#   This perl script reads through a single logfile as specified on the
#   invoking command line, and looks for "sessions" from individual IP
#   numbers.  Session records are written to ROOT.ssn and per-page records
#   to ROOT.pgs (ROOT being the logfile name up to its first "."); progress
#   messages and a statistical summary are written to standard out.
#
#   A session is considered to be a sequence of "hits" from a given IP
#   number, where the time between successive hits from the same IP is no
#   more than the value in $maximum_interpage_time
#
#   The logfile is assumed to be in the format of the EMWAC Windows NT WWW
#   server.
#
# Usage:
#   sessions LOGFILE
#
###############################################################################

require "timelocal.pl";
require "ctime.pl";

# Local variables and "constants"
$maximum_interpage_time = 5 * 60;   # seconds. If hits from the same IP are further apart
                                    # than this, it is considered another session.

%Month = ("jan",0, "feb",1, "mar",2, "apr",3, "may",4, "jun",5,
          "jul",6, "aug",7, "sep",8, "oct",9, "nov",10, "dec",11);
%Week  = ("sun",0, "mon",1, "tue", 2, "wed",3, "thu",4, "fri",5, "sat",6);

for ($i = 0; $i <= 20; $i += 1) {
    $timebucket[$i] = 0;            # Zero time bucket counters
}
for ($i = 0; $i <= 16; $i += 1) {
    $pagebucket[$i] = 0;            # Zero page bucket counters
}

$total_hits = 0;
$bad_lines = 0;
$pages_ignored = 0;
$IPs_ignored = 0;
# $earliest_timestamp is deliberately left undefined until the first hit is seen.
# (A fixed "very large" starting value such as 999999999 is smaller than any
# timestamp after September 2001, so the earliest time would never be recorded.)
$latest_timestamp = 0;
$wday_being_processed = "XXX";
$this_days_session_count = 0;

$total_IPs = 0;
$total_sessions = 0;
$total_of_session_times = 0;
$total_of_session_pages = 0;
$max_sessions = 0;
$max_sessions_IP = "";
$shortest_session_time = 999999999;
$shortest_session_time_IP = "";
$shortest_session_pages = 999999999;
$shortest_session_pages_IP = "";
$longest_session_time = 0;
$longest_session_time_IP = "";
$longest_session_pages = 0;
$longest_session_pages_IP = "";
$longest_page_interval = 0;
$longest_page = "";
$max_page_count = 0;
$max_page_count_page = "";

#DEBUG# ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($previous_t);
#DEBUG# print "Now is: $previous_t ";
#DEBUG# print "Second: $sec, Minute: $min, Hour: $hour, MonthDay: $mday, Month: $mon, Year: $year, Weekday: $wday, YearDay: $yday, Isdst: $isdst\n";

# The logfile we are to process is supplied via a parameter to the command line which invokes
# this script
$log_file = $ARGV[0];

# We will process the logfile and write session and page-reference information to separate files
# The names of the files are derived from the name of the logfile: if the logfile name is
# HS941217.LOG, the two output files will be HS941217.ssn (sessions) and HS941217.pgs (pages)
# NOTE: the root is everything before the FIRST "." in the name; the companion
# scripts derive their file names the same way, so keep the two in step.
($filename_root, $filename_suffix) = split(/\./,$log_file);
$sessions_file = $filename_root . ".ssn";
$pages_file = $filename_root . ".pgs";

print "Session and page summary for logfile $log_file to files $sessions_file and $pages_file\n";

open (SESSIONSFILE, ">$sessions_file") || die("Unable to open file $sessions_file for write: $!");
open (PAGESFILE, ">$pages_file") || die("Unable to open file $pages_file for write: $!");

# Read through the entire logfile, a line at a time, collecting information on "sessions"
GETLINE:
while ($LINE = <>) {
    #DEBUG# print $LINE;
    $total_hits = $total_hits + 1;

    # Translate all data to lowercase, for consistency
    ($line = $LINE) =~ tr/A-Z/a-z/;

    # Strip the line terminator.  (A plain chop would eat a real character from
    # an unterminated last line, and leave the "\r" of a CR/LF pair behind.)
    $line =~ s/[\r\n]+$//;

    # Split the logfile line into pieces
    ($day_name, $mon_name, $day_date, $time, $year, $incoming_IP_addr,
     $remote_IP_addr, $action, $page, $server) = split(/ +/, $line);
    ($hour, $minute, $second) = split(/:/, $time);
    $page =~ s/(\S*)(\s*)$/$1/;              # Trim trailing blanks (if any) on the page name
    $remote_IP_addr =~ s/(\S*)(\s*)$/$1/;    # Trim trailing blanks (if any) on the remote IP addr

    # Do a bit of data verification before processing the line, in an attempt
    # to exclude partial logfile records, and records that we want to ignore (i.e. hits
    # on "gif" files, hits from local IPs)
    if ($action ne "get") {
        $bad_lines += 1;
        next GETLINE;
    } elsif (&ignore_page($page)) {
        # Filter out any hits on pages that we want to ignore
        $pages_ignored += 1;
        next GETLINE;
    } elsif (&ignore_IP($remote_IP_addr)) {
        # Filter out any hits from IPs that we want to ignore
        $IPs_ignored += 1;
        next GETLINE;
    }
    #DEBUG# print $line;

    # Adjust any pieces to fit format required by "&timelocal" routine
    $year = $year-1900;

    # Convert from logfile (text) timestamp data to internal time format. This allows us
    # to calculate intervals between successive timestamps. It doesn't
    # seem to matter if you choose 0 or 1 for the "isdst" value...
    $timestamp = &timelocal(($second, $minute, $hour, $day_date, $Month{$mon_name},
                             $year, $Week{$day_name}, 0, 0));
    #DEBUG# print "Timestamp is: $timestamp ";
    #DEBUG# print "Second: $second, Minute: $minute, Hour: $hour, MonthDay: $day_date, Month: $Month{$mon_name}, Year: $year, Weekday: $Week{$day_name}, YearDay: ???, Isdst: 0\n";
    #DEBUG# # Convert the internal form back into external, to compare
    #DEBUG# ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($timestamp);
    #DEBUG# print "Converted back: $timestamp ";
    #DEBUG# print "Second: $sec, Minute: $min, Hour: $hour, MonthDay: $mday, Month: $mon, Year: $year, Weekday: $wday, YearDay: $yday, Isdst: $isdst\n";

    if (!defined($earliest_timestamp) || $timestamp < $earliest_timestamp) {
        $earliest_timestamp = $timestamp;
    }
    if ($timestamp > $latest_timestamp) {
        $latest_timestamp = $timestamp;
    }

    if ($day_name ne $wday_being_processed) {
        # We're on a different day: print out a message to show our progress
        if ($wday_being_processed ne "XXX") {
            # Print out a session count for the preceding day unless we're just starting
            print "$this_days_session_count sessions for the above date\n";
        }
        $day_being_processed = &ctime($timestamp);
        chop($day_being_processed);          # ctime's result ends with a newline
        print "Now processing logfile data for $day_being_processed ...\n";
        $wday_being_processed = $day_name;
        $this_days_session_count = 0;
    }

    if (! defined($session_start_time{$remote_IP_addr})) {
        # There has been no session yet for this IP
        #DEBUG# print "DEBUG : no session yet\t";
        $session_count{$remote_IP_addr} = 0;
        &start_session($remote_IP_addr, $timestamp, $page);
    } else {
        if (($timestamp - $session_latest_hit_time{$remote_IP_addr}) <= $maximum_interpage_time) {
            # This hit is still part of the same session
            $session_num_of_pages{$remote_IP_addr} += 1;

            # Append the number of seconds spent on the previous page
            # in the form pagename:time
            # and then the name of the current page
            $interval = $timestamp - $session_latest_hit_time{$remote_IP_addr};
            $session_pages{$remote_IP_addr} = $session_pages{$remote_IP_addr} .
                                              ":" . $interval . " " . $page;

            # Whenever we hit a new page, record a hit on the previous page, along
            # with the interval (the time spent "reading" that page)
            #DEBUG# print "DEBUG : same session(A)\tIP=$remote_IP_addr PAGE=$session_latest_hit_page{$remote_IP_addr}, TIME=$session_latest_hit_time{$remote_IP_addr}\n";
            &record_a_hit($session_latest_hit_page{$remote_IP_addr}, $interval);

            # Update the most recently hit page in this session, and the timestamp
            # associated with it
            $session_latest_hit_page{$remote_IP_addr} = $page;
            $session_latest_hit_time{$remote_IP_addr} = $timestamp;
            #DEBUG# print "DEBUG : same session(B)\tIP=$remote_IP_addr PAGE=$session_latest_hit_page{$remote_IP_addr}, TIME=$session_latest_hit_time{$remote_IP_addr}\n";
        } else {
            # This is a new session for the same IP

            # Print out the previous session information
            &print_session($remote_IP_addr);
            #DEBUG# print "DEBUG : new session\t";

            # Initialize for a new session for this IP
            &start_session($remote_IP_addr, $timestamp, $page);
        }
    }
}

# Print out all of the "in progress" sessions. Doesn't need to be sorted...
$total_IPs = 0;
foreach $remote_IP_addr (keys(%session_start_time)) {
    &print_session($remote_IP_addr);
    $total_IPs += 1;
}

# Print out all of the page-related information. Sort by page name
$total_unique_pages = 0;
foreach $page (sort (keys(%page_count))) {
    &print_page($page);
    $total_unique_pages = $total_unique_pages + 1;
}

close(SESSIONSFILE);
close(PAGESFILE);

# With no sessions at all (empty logfile, or every record rejected) there is
# nothing to summarise, and the averages and percentages below would divide by
# zero.  Report what was read and stop here.
if ($total_sessions == 0) {
    print "No sessions found in $log_file: $total_hits lines read, $bad_lines bad records, " .
          "$pages_ignored excluded by page name, $IPs_ignored excluded by IP\n";
    exit(0);
}

# Print final day's session count
print "$this_days_session_count sessions for the above date\n";

# =====================================================================
# P R I N T   O U T   S T A T I S T I C A L   I N F O R M A T I O N
#
# Session-related statistical information
#
$start = &ctime($earliest_timestamp);
$finish = &ctime($latest_timestamp);
chop($start);
chop($finish);

print "Session summary for $log_file ($start to $finish):\n";
print "=======================================================================================\n";
print "Total page references (hits) : $total_hits ($pages_ignored of these excluded by page name; $IPs_ignored by IP)\n";
print "Total bad data records : $bad_lines\n";
print "Total sessions : $total_sessions (max interpage time=$maximum_interpage_time seconds";
$time_in_minutes = $maximum_interpage_time / 60;
printf " / %-5.2f minutes)\n", $time_in_minutes;
print "Total unique pages referenced : $total_unique_pages\n";
print "Total # of different readers : $total_IPs\n\n";

$time_in_minutes = $total_of_session_times / 60;
printf "Total of all session times : %9.2f minutes\n", $time_in_minutes;
$time_in_minutes = $shortest_session_time / 60;
printf "Shortest session (time) : %9.2f minutes", $time_in_minutes;
print " (from $shortest_session_time_IP)\n";
$time_in_minutes = $longest_session_time / 60;
printf "Longest session (time) : %9.2f minutes", $time_in_minutes;
print " (from $longest_session_time_IP)\n";
$average = $total_of_session_times / $total_sessions / 60;    # Calculate average time as minutes
printf "Average session (time) : %9.2f minutes\n", $average;

print "Shortest session (pages) : $shortest_session_pages (from $shortest_session_pages_IP)\n";
print "Longest session (pages) : $longest_session_pages (from $longest_session_pages_IP)\n";
$average = $total_of_session_pages / $total_sessions;
printf "Average session (pages) : %-9.2f\n", $average;

# =====================================================================
# Remaining statistical information
#
print "Max sessions by one reader : $max_sessions (from $max_sessions_IP)\n";
print "Max page count for a page : $max_page_count (for $max_page_count_page)\n";
$time_in_minutes = $longest_page_interval / 60;
printf "Longest time reading a page : %9.2f minutes", $time_in_minutes;
print " (for $longest_page)\n";

# =====================================================================
# Bar chart distribution of session length in time and # of pages
#
@bucket_label = (" none", " <1.0", " <2.0", " <3.0", " <4.0", " <5.0", " <6.0",
                 " <7.0", " <8.0", " <9.0", "<10.0", "<11.0", "<12.0", "<13.0",
                 "<14.0", "<15.0", "<16.0", "<17.0", "<18.0", "<19.0", " >=20");

# Print out the session-time distribution as a bar chart, with the axes flipped:
# one row per bucket, one "%" character per whole percent of all sessions.
# Bucket 0 ("none") is not shown.
print "\nDistribution of session time as bar chart\n";
print "Sessions lasting indicated # of minutes (from $total_sessions sessions)\n";
print " Time | Cnt | % |\n";
print "======|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
for ($i = 1; $i <= 20; $i += 1) {
    $percent = ($timebucket[$i] * 100) / $total_sessions;
    printf "%5s | %4d | %2d |", $bucket_label[$i], $timebucket[$i], $percent;
    for ($j = 1; $j <= $percent; $j += 1) {
        print "%";
    }
    print "\n";
}
print "======|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";

@bucket_label = (" 1", " <3", " <6", " <9", " <12", " <15", " <18", " <21", " <24",
                 " <27", " <30", " <33", " <36", " <39", " <42", " <45", " <48",
                 " <51", " <54");

# Same again for the distribution of pages per session
print "\nDistribution of session page count as bar chart\n";
print "Sessions consisting of the indicated # of pages (\"hits\") (from $total_sessions sessions)\n";
print "Pages| Cnt | % |\n";
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
for ($i = 0; $i <= 16; $i += 1) {
    $percent = ($pagebucket[$i] * 100) / $total_sessions;
    printf "%4s | %4d | %2d |", $bucket_label[$i], $pagebucket[$i], $percent;
    for ($j = 1; $j <= $percent; $j += 1) {
        print "%";
    }
    print "\n";
}
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";

# =====================================================================
# S U B R O U T I N E S   E T C .
# =====================================================================

# ignore_IP(ip): true if hits from this IP address are to be excluded.
sub ignore_IP {
    local($remote_IP_addr) = @_;
    # Filter out hits from certain IPs (presumably local IPs)
    #DEBUG# print "DEBUG : ignore_IP\t($remote_IP_addr)\n";
    #DEBUG# $temp = "";    # This results in a FALSE result for the routine
    return ($remote_IP_addr =~ /^204\.94\.123\./);    # Exclude Hip IPs
}

# =====================================================================
# ignore_page(page): true if hits on this page are to be excluded.
sub ignore_page {
    local($page) = @_;
    # Filter out hits on certain pages (gifs etc)
    #DEBUG# print "DEBUG : ignore_page\t($page)\n";
    # The dot is escaped so that only a real ".gif" suffix matches
    return ($page =~ /\.gif$/);
}

# =====================================================================
# record_a_hit(page, interval): count one hit on a page and add the time spent
# on it.  An interval of -1 means "unknown": the page's current average is
# used instead (or 1 second if this is the page's first hit).  Also maintains
# the most-hit page and the longest single interval.
# Returns the interval actually recorded.
sub record_a_hit {
    local($page, $interval) = @_;
    # Record page-related information
    #DEBUG# print "DEBUG : record_a_hit\t($page; $interval)\n";
    if (! defined($page_count{$page})) {
        # This page hasn't been referenced before. Initialize the counters.
        # if $interval is -1, it indicates that we are to use the average time for
        # this page. If this is the only reference so far, use 1 second
        if ($interval == -1) {
            $interval = 1;
        }
        $page_count{$page} = 1;
        $page_total_intervals{$page} = $interval;
    } else {
        if ($interval == -1) {
            # This indicates that we are to use the average time for
            # this page (don't count this hit when calculating the average)
            $interval = $page_total_intervals{$page} / $page_count{$page};
        }
        # Increment the counters for this page
        $page_count{$page} += 1;
        $page_total_intervals{$page} += $interval;
    }
    #DEBUG# print "count=$page_count{$page} total=$page_total_intervals{$page} max=$max_page_count ";
    if ($page_count{$page} > $max_page_count) {
        #DEBUG# print "Max_page_count=$max_page_count";
        $max_page_count = $page_count{$page};
        $max_page_count_page = $page;
    }
    #DEBUG# print "\n";

    # We're interested in the longest interval
    if ($interval > $longest_page_interval) {
        $longest_page_interval = $interval;
        $longest_page = $page;
    }

    # Return the interval (or average interval, if we calculated it) as the subroutine's result
    return $interval;
}

# =====================================================================
# start_session(ip, timestamp, page): begin a new session for an IP with its
# first page hit, and bump the per-IP and per-day session counters.
sub start_session {
    local($remote_IP_addr, $timestamp, $page) = @_;
    # Initialize all inter-related variables associated with a session
    #DEBUG# print "start_session\t($remote_IP_addr, $timestamp, $page)\n";
    $session_start_time{$remote_IP_addr} = $timestamp;

    # Track each page name. As the session proceeds, the "reading time" spent
    # on each page is appended after the pagename as ":seconds"
    $session_pages{$remote_IP_addr} = $page;
    $session_latest_hit_page{$remote_IP_addr} = $page;
    $session_latest_hit_time{$remote_IP_addr} = $timestamp;
    $session_length{$remote_IP_addr} = 0;
    $session_num_of_pages{$remote_IP_addr} = 1;

    $session_count{$remote_IP_addr} = $session_count{$remote_IP_addr} + 1;
    $this_days_session_count += 1;
    if ($session_count{$remote_IP_addr} > $max_sessions) {
        $max_sessions = $session_count{$remote_IP_addr};
        $max_sessions_IP = $remote_IP_addr;
    }
}

# =====================================================================
# print_page(page): write one record to the pages file:
#   page <TAB> " : " hit-count <TAB> total-seconds <TAB> average-seconds
sub print_page {
    local($page) = @_;
    # Print out a record of information on references to this page
    $average = $page_total_intervals{$page} / $page_count{$page};
    print PAGESFILE "$page\t : $page_count{$page}\t$page_total_intervals{$page}\t$average\n";
}

# =====================================================================
# print_session(ip): close off the current session for an IP: write its record
# to the sessions file and fold it into the grand totals, the bar-chart
# buckets and the shortest/longest statistics.
sub print_session {
    local($remote_IP_addr) = @_;
    local($bucket, $step);
    #DEBUG# print "DEBUG : print_session\t($remote_IP_addr)\n";
    $total_sessions += 1;

    # Record a hit for the last page referenced in this session. Since we don't
    # know when the session ended, use the average interval time for this page as the
    # interval (indicated with a parameter value of -1). The returned value will
    # be the average interval time for that page, which we append to the session
    # record (below)
    $average_interval = &record_a_hit($session_latest_hit_page{$remote_IP_addr}, -1);

    # Calculate the session length for this session. Include the (estimated) time spent
    # reading the last page in the session (estimated to be the average for that page)
    $session_length{$remote_IP_addr} = $session_latest_hit_time{$remote_IP_addr} -
                                       $session_start_time{$remote_IP_addr} +
                                       $average_interval;

    # Print out a session record from the variables associated with a session
    print SESSIONSFILE "$remote_IP_addr\t$session_start_time{$remote_IP_addr}\t$session_length{$remote_IP_addr}\t" .
                       "$session_num_of_pages{$remote_IP_addr} : ";

    # Each session record ends with a list of the pages in that session, and the time spent on that
    # page. Since there is no record in the logfile of when a session ended, the last page name will
    # not have a "reading time" associated with it. Use the average time for that page (returned by
    # "record_a_hit" above), rather than nothing at all.
    print SESSIONSFILE "$session_pages{$remote_IP_addr}:$average_interval\n";

    # Accumulate grand total stats for the entire logfile
    $total_of_session_times += $session_length{$remote_IP_addr};
    $total_of_session_pages += $session_num_of_pages{$remote_IP_addr};

    # Fit the session time and page count into the appropriate bucket for our
    # bar charts later

    # Time: bucket 0 is exactly zero seconds, bucket N (1..19) is anything up to
    # and including N minutes, bucket 20 is everything longer than 19 minutes.
    $session_webzine_time = $session_length{$remote_IP_addr};
    $bucket = 20;
    if ($session_webzine_time == 0) {
        $bucket = 0;
    } else {
        for ($step = 1; $step <= 19; $step += 1) {
            if ($session_webzine_time <= 60 * $step) {
                $bucket = $step;
                last;
            }
        }
    }
    $timebucket[$bucket] += 1;

    # Pages: bucket 0 is a single page, bucket N (1..15) is anything up to and
    # including 3*N pages, bucket 16 is everything over 45 pages.
    $session_webzine_pages = $session_num_of_pages{$remote_IP_addr};
    $bucket = 16;
    if ($session_webzine_pages == 1) {
        $bucket = 0;
    } else {
        for ($step = 1; $step <= 15; $step += 1) {
            if ($session_webzine_pages <= 3 * $step) {
                $bucket = $step;
                last;
            }
        }
    }
    $pagebucket[$bucket] += 1;

    # Track the shortest and longest sessions, by time and by page count
    if ($session_length{$remote_IP_addr} < $shortest_session_time) {
        $shortest_session_time = $session_length{$remote_IP_addr};
        $shortest_session_time_IP = $remote_IP_addr;
    }
    if ($session_num_of_pages{$remote_IP_addr} < $shortest_session_pages) {
        $shortest_session_pages = $session_num_of_pages{$remote_IP_addr};
        $shortest_session_pages_IP = $remote_IP_addr;
    }
    if ($session_length{$remote_IP_addr} > $longest_session_time) {
        $longest_session_time = $session_length{$remote_IP_addr};
        $longest_session_time_IP = $remote_IP_addr;
    }
    if ($session_num_of_pages{$remote_IP_addr} > $longest_session_pages) {
        $longest_session_pages = $session_num_of_pages{$remote_IP_addr};
        $longest_session_pages_IP = $remote_IP_addr;
    }
}
#!/usr/local/bin/perl
#
###############################################################################
#
# Script name:   transit
# Script author: Michael Hayward
#
# Description:
#   This perl script reads through a "sessions" logfile as specified on the
#   invoking command line, and gathers stats on the "transitions" between
#   the various pages in the sessions.
#
#   The session logfile is produced as output from the "sessions" script,
#   as it processes a Windows NT WWW server's logfile. The session logfile
#   is stored in a file with the suffix ".ssn"
#
# Usage:
#   transit ROOT.ssn
#
###############################################################################

require "timelocal.pl";
require "ctime.pl";

$top = 25;      # Number of most frequent transitions to echo to STDOUT (0 = just the top one)

$total_sessions = 0;
$most_popular_transition_count = 0;
$most_popular_transition = "";

$sessions_file = $ARGV[0];

# We will process the sessions file and write inter-page transition information to 2 separate files.
# The names of the files are derived from the name of the session file: if the sessions file name is
# HS941217.ssn, the two output files will be HS941217.trn (transitions sorted by To_page) and HS941217.trs
# (transitions sorted by frequency of transition, from most to least)
($filename_root, $filename_suffix) = split(/\./,$sessions_file);
$transitions_file = $filename_root . ".trn";
$sorted_transitions = $filename_root . ".trs";

print "Inter-page transition summary for session file $sessions_file (to files $transitions_file & $sorted_transitions)\n";

open (TRANSITIONSFILE, ">$transitions_file") || die("Unable to open file $transitions_file for write: $!");
open (SORTEDFILE, ">$sorted_transitions") || die("Unable to open file $sorted_transitions for write: $!");

# Read through the entire sessions file, a line (session) at a time, collecting information on
# inter-page "transitions"
GETLINE:
while ($line = <>) {
    #DEBUG# print $line;

    # Strip the line terminator.  (A plain chop would eat a real character from
    # an unterminated last line.)
    $line =~ s/[\r\n]+$//;

    # Split the session file line into pieces
    ($part1, $part2) = split (/ : /, $line);

    # A blank or malformed line has no page list: skip it rather than register
    # a bogus transition between two empty page names
    next GETLINE if (!defined($part2) || $part2 eq "");

    $total_sessions = $total_sessions + 1;

    ($remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages) = split(/\t/, $part1);
    (@session_pages) = split(/ /, $part2);
    #DEBUG# print "DEBUG : $remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages\n";
    #DEBUG# print "DEBUG : $part2\n";

    # Go through all of the pages in the session and accumulate the inter-page transitions
    $from_page_entry = shift(@session_pages);

    # Register the transition out of "hyperspace" as well (i.e. start of session, entering
    # our server's space from parts unknown...
    &register_a_transition(":0", $from_page_entry);

    $to_page_entry = shift(@session_pages);
    while ($to_page_entry ne "") {
        &register_a_transition($from_page_entry, $to_page_entry);
        $from_page_entry = $to_page_entry;
        $to_page_entry = shift(@session_pages);
    }

    # Register the transition into "hyperspace" as well (i.e. end of session, leaving
    # our server's space for parts unknown...
    # NOTE(review): the "to" page name here is a single blank (" :0"), whereas the
    # start-of-session "from" page above is empty (":0").  Kept exactly as found;
    # presumably the blank keeps the key's trailing field from being dropped when
    # it is split apart again -- confirm before changing.
    &register_a_transition($from_page_entry, " :0");
}

# Write the transitions file, sorted by transition key.  This pass also works
# out the most popular transition (see print_transition).
$total_transitions = 0;
foreach $transition (sort (keys(%transition_count))) {
    &print_transition(TRANSITIONSFILE, $transition, 0);
    $total_transitions += 1;
}

print "Total session records processed : $total_sessions\n";
print "Total unique inter-page transitions : $total_transitions\n";
if ($top <= 0) {
    ($from_page, $to_page) = split(/:/, $most_popular_transition);
    print "Most common inter-page transition : $from_page to $to_page ($most_popular_transition_count times)\n";
} else {
    print "Top $top inter-page transitions (complete sorted list in $sorted_transitions):\n";
    print "From page\tTo page\tCount\tTotal inter-page delay\tShortest\tLongest\tAverage\n";
}

# Now print the transitions into another file, sorted from the most to the least frequent transition
# Echo the top $top transitions to STDOUT
$i = 0;
foreach $transition (sort byvalue (keys(%transition_count))) {
    $i += 1;
    &print_transition(SORTEDFILE, $transition, ($i <= $top));
}

close (TRANSITIONSFILE);
close (SORTEDFILE);

# =====================================================================
# S U B R O U T I N E S   E T C .
# =====================================================================

# Sort comparison: order transition keys from most to least frequent.
sub byvalue {
    $transition_count{$b} <=> $transition_count{$a};
}

# print_transition(FILE, transition, echo): write one transition record to the
# named filehandle, and to STDOUT as well if echo is true.  Also keeps track of
# the most frequent transition seen so far.
sub print_transition {
    local($FILE, $transition, $echo) = @_;
    #DEBUG# print "DEBUG : print_transition\t($transition)\n";

    # Print a transition entry. The $transition key is in the form $from_page:$to_page
    # Split this into its parts
    ($from_page, $to_page) = split(/:/, $transition);

    # Now print out a transition record
    $average = $transition_inter_page_delay_total{$transition} / $transition_count{$transition};
    print $FILE "$from_page\t$to_page\t$transition_count{$transition}\t" .
                "$transition_inter_page_delay_total{$transition}\t" .
                "$transition_shortest_inter_page_delay{$transition}\t" .
                "$transition_longest_inter_page_delay{$transition}\t$average\n";
    if ($echo) {
        # Echo to STDOUT (if asked)
        print "$from_page\t$to_page\t$transition_count{$transition}\t" .
              "$transition_inter_page_delay_total{$transition}\t" .
              "$transition_shortest_inter_page_delay{$transition}\t" .
              "$transition_longest_inter_page_delay{$transition}\t$average\n";
    }

    if ($transition_count{$transition} > $most_popular_transition_count) {
        $most_popular_transition_count = $transition_count{$transition};
        $most_popular_transition = $transition;
    }
}

# register_a_transition(from_entry, to_entry): count one transition between two
# page entries, and accumulate the total, shortest and longest time spent on
# the "from" page before moving on.
sub register_a_transition {
    local($from_page_entry, $to_page_entry) = @_;
    # Process an inter-page transition from one page entry to another.
    # Each page entry is in the form pagename:readingtime
    #DEBUG# print "DEBUG : register_a_transition\t($from_page, $to_page)\n";

    # Split the page entries into their component parts and construct the transition key
    ($from_page, $from_time) = split(/:/, $from_page_entry);
    ($to_page, $to_time) = split(/:/, $to_page_entry);
    $transition = $from_page . ":" . $to_page;

    # Now record information on the transition
    if (! defined($transition_count{$transition})) {
        # This is the first time that this transition has occurred: initialize
        $transition_count{$transition} = 1;
        $transition_inter_page_delay_total{$transition} = $from_time;
        $transition_shortest_inter_page_delay{$transition} = $from_time;
        $transition_longest_inter_page_delay{$transition} = $from_time;
    } else {
        # Increment the counters for this transition
        $transition_count{$transition} += 1;
        #DEBUG# print "DEBUG : increment to $transition_count{$transition} for $transition\n";
        $transition_inter_page_delay_total{$transition} += $from_time;

        # Track the shortest and longest inter-page delays
        if ($from_time < $transition_shortest_inter_page_delay{$transition}) {
            $transition_shortest_inter_page_delay{$transition} = $from_time;
        } elsif ($from_time > $transition_longest_inter_page_delay{$transition}) {
            $transition_longest_inter_page_delay{$transition} = $from_time;
        }
    }
}
#!/usr/local/bin/perl # ############################################################################### # # Script name: hipstats # Script author: Michael Hayward # # Description: # This perl script builds on the "sessions" and "transit" scripts to extract # some useful information specific to the hip webzine. It takes the name # of a logfile (or root), and processes the associated session and transition # files. # # The session logfile is produced as output from the "sessions" script, # as it processes a Windows NT WWW server's logfile. The session logfile # is stored in a file with the suffix ".ssn". The transition files are # produced by the "transit" script: ".trn" suffix contains transition stats # sorted by "to" page; ".trs" suffix contains transition stats sorted from # most frequent to least frequently used. # # ############################################################################### require "timelocal.pl"; require "ctime.pl"; # Patterns to use to determine if a page is webzine-related @webzine_patterns = ("/col/.*", "/bestnet/.*", "/tour/.*", "/zodiac/.*", "/", "/cover.htm", "/cgi-bin/.*/tour.*htm"); $total_hits = 0; $top = 10; # Number of "most popular" pages to show hits for $filename = $ARGV[0]; # Get the filename root from the command line argument, and then determine the # filenames we will be processing ($filename_root, $filename_suffix) = split(/\./,$filename); $pages_file = $filename_root . ".pgs"; $sessions_file = $filename_root . ".ssn"; $transitions_file = $filename_root . ".trn"; $sorted_transitions = $filename_root . 
".trs"; print "Hip Webzine stats from files $sessions_file/$pages_file/$transitions_file/$sorted_transitions\n"; open (PAGESFILE, "$pages_file") || die("Unable to open file $pages_file for read"); open (SESSIONSFILE, "$sessions_file") || die("Unable to open file $sessions_file for read"); open (TRANSITIONSFILE, "$transitions_file") || die("Unable to open file $transitions_file for read"); open (SORTEDFILE, "$sorted_transitions") || die("Unable to open file $sorted_transitions for read"); ########################################################################################### # # R E P O R T S B A S E D O N T H E P A G E S F I L E ( R O O T . p g s ) # ########################################################################################### # Process the pages file print "\n=========================\n"; print "= Page based reports =\n"; print "=========================\n"; GETLINE: while ($line =) { #DEBUG# print $line; #DEBUG# chop ($line); # Split the pages file line into pieces, and re-build the associative arrays # of page-related information ($page, $part2) = split (/ : /, $line); $page =~ s/(\S*)(\s*)$/$1/; # Trim trailing blanks on the page name ($page_count{$page}, $page_total_intervals{$page}, $average{$page}) = split(/\t/, $part2); # Track the total hits in this logfile $total_hits += $page_count{$page}; #DEBUG# print "$page, $page_count{$page}, $page_total_intervals{$page}, $average{$page}\n"; #DEBUG# } # ############################################################################ # # Report # 1 & #2: Calculate completion rate on the hip columns. And # while we're at it, determine "bestnet" readership too. # # Go through all the pages by name, and find information on the columns. 
# Columns are named according to the following convention:
#     /col/thisweek/<column>1.htm      First page of current column
#     /col/thisweek/<column>2.htm      Second page of current column
#     /col/<date>/<column>1.htm        First page of back issue column
#     /col/<date>/<column>2.htm        Second page of back issue column
# "Best of the net" pages are named according to the following convention:
#     /bestnet/thisweek/index.htm      "Best of Net" index
#     /bestnet/thisweek/bnN.htm        Nth "Best of Net" item (N=1, 2, 3)
#     /bestnet/<date>/index.htm        Back issue "Best of Net" index
#     /bestnet/<date>/bnN.htm          Nth back issue "Best of Net" item
# (The <column>/<date> placeholders are reconstructed from the patterns
# below; <date> is whatever matches \d\dwk\d\d.)
#
# Back issue counts are keyed as "<name>:<date>".  Whenever one page of a
# column is seen, the count for its other page is seeded with 0 so that both
# hashes always carry the same keys.

foreach $page (sort (keys(%page_count))) {

    if ($page =~ /^\/col\/(.*)1\.htm$/) {
        # Page one of a column
        #DEBUG# print "Column $1\t"; #DEBUG#
        if ($page =~ /^\/col\/thisweek\/(.*)1\.htm$/) {
            # Current issue, column name in $1
            $col_name = $1;
            $column_page1_count{$col_name} = $page_count{$page};
            if (! defined($column_page2_count{$col_name})) {
                $column_page2_count{$col_name} = 0;
            }
        } elsif ($page =~ /^\/col\/(\d\dwk\d\d)\/(.*)1\.htm$/) {
            # Back issue, date in $1; column name in $2
            #DEBUG# print "backissue1: $1 $2\n"; #DEBUG#
            $key = $2 . ":" . $1;
            $column_backissue_page1_count{$key} = $page_count{$page};
            if (! defined($column_backissue_page2_count{$key})) {
                $column_backissue_page2_count{$key} = 0;
            }
        }

    } elsif ($page =~ /^\/col\/.*2\.htm$/) {
        # Page two of a column
        if ($page =~ /^\/col\/thisweek\/(.*)2\.htm$/) {
            # Current issue, column name in $1
            $col_name = $1;
            if (! defined($column_page1_count{$col_name})) {
                $column_page1_count{$col_name} = 0;
            }
            $column_page2_count{$col_name} = $page_count{$page};
        } elsif ($page =~ /^\/col\/(\d\dwk\d\d)\/(.*)2\.htm$/) {
            # Back issue, date in $1; column name in $2
            #DEBUG# print "backissue2: $1 $2\n"; #DEBUG#
            $key = $2 . ":" . $1;
            if (! defined($column_backissue_page1_count{$key})) {
                $column_backissue_page1_count{$key} = 0;
            }
            $column_backissue_page2_count{$key} = $page_count{$page};
        }

    } elsif ($page =~ /^\/bestnet\/.*\/index\.htm$/) {
        # A "Best of Net" index
        if ($page =~ /^\/bestnet\/thisweek\/index\.htm$/) {
            # Current issue index
            $bestnet_count{"index"} = $page_count{$page};
        } elsif ($page =~ /^\/bestnet\/(\d\dwk\d\d)\/index\.htm$/) {
            # Back issue index; date in $1
            $key = "index" . ":" . $1;
            $bestnet_backissue_count{$key} = $page_count{$page};
        }

    } elsif ($page =~ /^\/bestnet\/.*\d\.htm$/) {
        # A "Best of Net" item
        if ($page =~ /^\/bestnet\/thisweek\/(bn\d)\.htm$/) {
            # Current issue item; item name/number in $1
            $bestnet_count{$1} = $page_count{$page};
        } elsif ($page =~ /^\/bestnet\/(\d\dwk\d\d)\/(bn\d)\.htm$/) {
            # Back issue item; date in $1, item name/number in $2
            $key = $2 . ":" . $1;
            $bestnet_backissue_count{$key} = $page_count{$page};
        }
    }
}

# ############################################################################
#
# Report # 1:
print "\nReport 1: Hit count (and completion rate) for webzine columns, sorted by hit count\n";
print "Column Hit counts: Cmpltn Back iss. Hit counts: Cmpltn\n";
print "name Page1 Page2 (%) date Page1 Page2 (%)\n";
print "====== ===== ===== ====== ======== ===== ===== ======\n";

# Sort the backissue keys once before generating the report
@sorted_backissue_keys = sort (keys(%column_backissue_page1_count));

foreach $column (sort by_page1_hitrate (keys(%column_page1_count))) {
    # The page-1 count is 0 when only page 2 of a column was ever hit (see
    # the seeding above); report 0% instead of dividing by zero.
    $completion = $column_page1_count{$column}
        ? ($column_page2_count{$column} * 100) / $column_page1_count{$column}
        : 0;
    printf "%-7s %5d %5d %6.2f", $column, $column_page1_count{$column}, $column_page2_count{$column}, $completion;
    &print_column_backissues($column);
    print "\n";
}

# ############################################################################
#
# Report # 2:
print "\nReport 2: Hit count for \"Best of Net\" items, sorted by hit count\n";
print "Item Hit Back iss. Hit\n";
print "name Count date Count\n";
print "===== ===== ======== =====\n";

# Sort the backissue keys once before generating the report
@sorted_backissue_keys = sort (keys(%bestnet_backissue_count));

foreach $item (sort by_bestnet_hitrate (keys(%bestnet_count))) {
    printf "%-5s %5d ", $item, $bestnet_count{$item};
    &print_bestnet_backissues($item);
    print "\n";
}

# Ruler line shared by the page hit charts (Reports 3 and 4)
$page_chart_rule = "======================================= ===== ====== |===1|0==2|0==3|0==4|0==5|0==6|0==7|0==8|0==9|0=10|0\n";

# ################################################################
#
# Report # 3: Hit rates for all pages
#
print "\nReport 3: Hit count for top $top of all pages, sorted by hit count\n";
print "(from $total_hits page hits on all pages for the period)\n";
print " Hit % of\n";
print "Page name count total\n";
print $page_chart_rule;

# Sort all pages by hit rate once here, before generating this report
@sorted_pages = sort by_hitrate (keys(%page_count));

$i = 0;
foreach $page (@sorted_pages) {
    # Guard against an empty pages file ($total_hits of 0)
    $rate = $total_hits ? ($page_count{$page} * 100) / $total_hits : 0;
    printf "%-35s %9d %6.2f |", $page, $page_count{$page}, $rate;
    print "%" x int($rate / 2);     # One bar character per full 2%
    print "\n";
    $i += 1;
    last if ($i >= $top);
}
print $page_chart_rule;

# ################################################################
#
# Report # 4: Hit rates for all webzine-related pages
#
print "\nReport 4: Hit count for top $top of all webzine-related pages, sorted by hit count\n";
print "(from $total_hits page hits on all pages for the period)\n";
print " Hit % of\n";
print "Page name count total\n";
print $page_chart_rule;

$i = 0;
foreach $page (@sorted_pages) {
    if (&page_is_of_interest($page)) {
        $rate = $total_hits ? ($page_count{$page} * 100) / $total_hits : 0;
        printf "%-35s %9d %6.2f |", $page, $page_count{$page}, $rate;
        print "%" x int($rate / 2);     # One bar character per full 2%
        print "\n";
        $i += 1;
        last if ($i >= $top);
    }
}
print $page_chart_rule;

##############################################################################################
#
#   R E P O R T S   B A S E D   O N   T H E   S E S S I O N S   F I L E   ( R O O T . s s n )
#
##############################################################################################

# Process the sessions file
print "\n=========================\n";
print "= Session based reports =\n";
print "=========================\n";

$total_sessions = 0;
@timebucket = (0) x 17;         # Zero time bucket counters (indexes 0..16)
@percentbucket = (0) x 21;      # Zero percent bucket counters (indexes 0..20)

# Map a value onto a distribution bucket index:
#     0        the value is exactly 0
#     1..$last the value is at most index * $width (rounded up); anything
#              beyond ($last - 1) * $width falls into the catch-all $last
# Non-zero values below one bucket width (including negatives) land in 1.
sub bucket_index {
    local($value, $width, $last) = @_;
    local($index);

    return 0 if ($value == 0);

    $index = int($value / $width);
    $index += 1 if ($index * $width < $value);      # Round up to the bucket's upper bound
    $index = 1 if ($index < 1);
    $index = $last if ($index > $last);
    $index;
}

# Each sessions file line is parsed below as:
#     <IP> TAB <start time> TAB <length> TAB <number of pages> : <page entry> <page entry> ...
# where the page entries are separated by single blanks.
GETLINE2:
while ($line = <SESSIONSFILE>) {

    #DEBUG# print $line; #DEBUG#

    $total_sessions += 1;

    # Strip the line terminator without eating data on an unterminated last line
    $line =~ s/\r?\n$//;

    # Split the session file line into pieces
    ($part1, $part2) = split (/ : /, $line);
    ($remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages) = split(/\t/, $part1);
    (@session_pages) = split(/ /, $part2);

    #DEBUG# print "DEBUG : $remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages\n";
    #DEBUG# print "DEBUG : $part2\n";

    # Go through all of the pages in the session and accumulate the "webzine-space" time
    $from_page_entry = shift(@session_pages);
    $to_page_entry = shift(@session_pages);
    $session_webzine_time = 0;
    while ($to_page_entry ne "") {
        $session_webzine_time += &accumulate_webzine_space_info($from_page_entry, $to_page_entry);
        $from_page_entry = $to_page_entry;
        $to_page_entry = shift(@session_pages);
    }
    # Include the time spent on the last page of the session too
    $session_webzine_time += &accumulate_webzine_space_info($from_page_entry, $to_page_entry);

    # Now that we have this session's time spent in "webzine-space", increment the counter
    # associated with the corresponding "bucket" for our distribution charts.  A session
    # length of 0 counts as 0% rather than dividing by zero.
    $percent = $session_length ? ($session_webzine_time * 100) / $session_length : 0;

    # Time buckets are 30 seconds wide; bucket 16 holds everything over 450 seconds
    $timebucket[&bucket_index($session_webzine_time, 30, 16)] += 1;

    # Percent buckets are 5 points wide; bucket 20 holds everything over 95%
    $percentbucket[&bucket_index($percent, 5, 20)] += 1;
}

# ################################################################
#
# Report # 5: Distribution of time in "webzine-space" in seconds
#
#print "\nReport 5: Distribution of session time\nSessions spending indicated # of seconds in \"webzine-space\" (of $total_sessions sessions)\n";
#print "none 0-30 <60 <90 <120 <150 <180 <210 <240 <270 <300 <330 <360 <390 <420 <450 >450\n";
#print "==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====\n";
#for ($i = 0; $i <= 16; $i += 1) {
#    printf "%4d ", $timebucket[$i];
#}
#print "(#)\n";
#for ($i = 0; $i <= 16; $i += 1) {
#    $percent = ($timebucket[$i] * 100) / $total_sessions;
#    printf "%4d ", $percent;
#}
#print "(%)\n";

# One label (in minutes) per time bucket 0..16.  Bucket 16 is the catch-all
# for sessions over 450 seconds, hence ">7.5".
@bucket_label = ("none", "<0.5", "<1.0", "<1.5", "<2.0", "<2.5", "<3.0", "<3.5", "<4.0",
                 "<4.5", "<5.0", "<5.5", "<6.0", "<6.5", "<7.0", "<7.5", ">7.5");

# Try to print out the above stats as a bar chart, with the axes flipped.
# Report 5 bar chart.  The original note here says this chart (and the
# following one) should be scaled to handle the maximum value; the code
# below does not scale: it prints one bar character per whole percent.

# Ruler line shared by the session distribution charts (Reports 5 and 6)
$session_chart_rule = "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";

print "\nReport 5: Distribution of session time as bar chart\n";
print "Sessions spending indicated # of minutes in \"webzine-space\" (of $total_sessions sessions)\n";
print "Time | Cnt | % |\n";
print $session_chart_rule;
for ($i = 0; $i <= 16; $i += 1) {
    $percent = &percent_of($timebucket[$i], $total_sessions);
    printf "%4s | %4d | %2d |", $bucket_label[$i], $timebucket[$i], $percent;
    print "%" x int($percent);
    print "\n";
}
print $session_chart_rule;

# ################################################################
#
# Report # 6: Distribution of time in "webzine-space" as % of total session time
#
#print "\nReport 6: Distribution of session time\nSessions spending indicated % of session time in \"webzine-space\" (of $total_sessions sessions)\n";
#print "none <5 <10 <15 <20 <25 <30 <35 <40 <45 <50 <55 <60 <65 <70 <75 <80 <85 <90 <95 <100\n";
#print "==== === === === === === === === === === === === === === === === === === === === ====\n";
#for ($i = 0; $i <= 20; $i += 1) {
#    printf "%4d", $percentbucket[$i];
#}
#print " (#)\n";
#for ($i = 0; $i <= 20; $i += 1) {
#    $percent = ($percentbucket[$i] * 100) / $total_sessions;
#    printf "%4d", $percent;
#}
#print " (%)\n";

@bucket_label = ("none", " <5", " <10", " <15", " <20", " <25", " <30", " <35", " <40", " <45", " <50",
                 " <55", " <60", " <65", " <70", " <75", " <80", " <85", " <90", " <95", "<100");

# Print out the above stats as a bar chart, with the axes flipped
print "\nReport 6: Distribution of session time as bar chart\n";
print "Sessions spending indicated % of session time in \"webzine-space\" (of $total_sessions sessions)\n";
print " % | Cnt | % |\n";
print $session_chart_rule;
for ($i = 0; $i <= 20; $i += 1) {
    $percent = &percent_of($percentbucket[$i], $total_sessions);
    printf "%4s | %4d | %2d |", $bucket_label[$i], $percentbucket[$i], $percent;
    print "%" x int($percent);
    print "\n";
}
print $session_chart_rule;

##############################################################################################
#
#   R E P O R T S   B A S E D   O N   O N E   O F   T H E   T R A N S I T I O N S   F I L E S
#
#   ( R O O T . t r n :   T R A N S I T I O N S   S O R T E D   B Y   " F R O M " )
#
##############################################################################################

# Process the transition file
print "\n============================\n";
print "= Transition based reports =\n";
print "============================\n";

$total_transitions = 0;
$total_entries = 0;
$total_exits = 0;
$total_inter_webzine = 0;

# Each transitions file line holds seven tab-separated fields (see the split
# below).  Transitions are classified by whether each end is a webzine page
# and counted under the key "<from page> TAB <to page>".  A tab is used as
# the key separator because it cannot occur inside a tab-split field.
GETLINE3:
while ($line = <TRANSITIONSFILE>) {

    #DEBUG# print $line; #DEBUG#

    # Strip the line terminator without eating data on an unterminated last line
    $line =~ s/\r?\n$//;

    # Split the transitions file line into pieces
    ($from_page, $to_page, $transition_count, $transition_inter_page_delay_total,
     $transition_shortest_inter_page_delay, $transition_longest_inter_page_delay,
     $average) = split(/\t/, $line);

    $total_transitions += $transition_count;
    $transition = $from_page . "\t" . $to_page;

    # Test each end of the transition against the webzine patterns only once
    $from_is_webzine = &page_is_of_interest($from_page);
    $to_is_webzine = &page_is_of_interest($to_page);

    if ($from_is_webzine && (! $to_is_webzine)) {
        # Exit from "webzine-space"
        $total_exits += $transition_count;
        $exit_count{$transition} = $transition_count;
    } elsif ((! $from_is_webzine) && $to_is_webzine) {
        # Entry into "webzine-space"
        $total_entries += $transition_count;
        $entry_count{$transition} = $transition_count;
    } elsif ($from_is_webzine && $to_is_webzine) {
        # An inter-page transition within the webzine
        $total_inter_webzine += $transition_count;
        $inter_webzine_count{$transition} = $transition_count;
    }
}

# ################################################################
#
# Report # 7: Top $top ways our readers entered "webzine-space"
#
print "\nReport 7: Top $top ways our readers entered \"webzine-space\"\n";
print "($total_entries entries to webzine-space from $total_transitions total inter-page transitions)\n";
print " (% of (% of \n";
print "Entry from: To webzine-space at: Count entries) total)\n";
print "========================== ======================== ===== ======== ======\n";

$i = 0;
foreach $transition (sort by_entry_count (keys(%entry_count))) {
    ($from_page, $to_page) = split(/\t/, $transition, 2);
    $percent1 = &percent_of($entry_count{$transition}, $total_entries);
    $percent2 = &percent_of($entry_count{$transition}, $total_transitions);
    printf "%-35s %-35s %5d %6.2f %6.2f\n", $from_page, $to_page, $entry_count{$transition}, $percent1, $percent2;
    $i += 1;
    last if ($i >= $top);
}

# ################################################################
#
# Report # 8: Top $top ways our readers left "webzine-space"
#
print "\nReport 8: Top $top ways our readers left \"webzine-space\"\n";
print "($total_exits exits from webzine-space from $total_transitions total inter-page transitions)\n";
print " (% of (% of \n";
# The first percentage column is relative to $total_exits, so label it "exits"
print "Exit from webzine-space at: To: Count exits) total)\n";
print "=========================== ======================== ===== ======== ======\n";

$i = 0;
foreach $transition (sort by_exit_count (keys(%exit_count))) {
    ($from_page, $to_page) = split(/\t/, $transition, 2);
    $percent1 = &percent_of($exit_count{$transition}, $total_exits);
    $percent2 = &percent_of($exit_count{$transition}, $total_transitions);
    printf "%-35s %-35s %5d %6.2f %6.2f\n", $from_page, $to_page, $exit_count{$transition}, $percent1, $percent2;
    $i += 1;
    last if ($i >= $top);
}

# ################################################################
#
# Report # 9: Top $top inter-page transitions within "webzine-space"
#
print "\nReport 9: Top $top inter-page transitions within \"webzine-space\"\n";
print "($total_inter_webzine inter-webzine transitions from $total_transitions total inter-page transitions)\n";
print " (% of (% of \n";
print "Transition from: To: Count links) total)\n";
print "=========================== ======================== ===== ======== ======\n";

$i = 0;
foreach $transition (sort by_inter_webzine_count (keys(%inter_webzine_count))) {
    ($from_page, $to_page) = split(/\t/, $transition, 2);
    $percent1 = &percent_of($inter_webzine_count{$transition}, $total_inter_webzine);
    $percent2 = &percent_of($inter_webzine_count{$transition}, $total_transitions);
    printf "%-35s %-35s %5d %6.2f %6.2f\n", $from_page, $to_page, $inter_webzine_count{$transition}, $percent1, $percent2;
    $i += 1;
    last if ($i >= $top);
}

close (PAGESFILE);
close (SESSIONSFILE);
close (TRANSITIONSFILE);
close (SORTEDFILE);

#
######################################################################################
#   S U B R O U T I N E S
#

# Sort comparators: each orders the keys of one count hash by descending count.

sub by_bestnet_hitrate {
    $bestnet_count{$b} <=> $bestnet_count{$a};
}

sub by_entry_count {
    $entry_count{$b} <=> $entry_count{$a};
}

sub by_exit_count {
    $exit_count{$b} <=> $exit_count{$a};
}

sub by_inter_webzine_count {
    $inter_webzine_count{$b} <=> $inter_webzine_count{$a};
}

sub by_hitrate {
    $page_count{$b} <=> $page_count{$a};
}

sub by_page1_hitrate {
    $column_page1_count{$b} <=> $column_page1_count{$a};
}

# Return $part as a percentage of $whole, or 0 when $whole is zero or
# undefined (so an empty input file cannot cause a division by zero).
sub percent_of {
    local($part, $whole) = @_;

    $whole ? ($part * 100) / $whole : 0;
}

# Return the number of seconds that were spent in "webzine-space" for one
# step of a session: the time attached to the "from" entry when its page
# matches one of @webzine_patterns, otherwise 0.  Entries have the form
# "<page>:<time>".  $to_page_entry is accepted for the benefit of existing
# callers but does not influence the result.
sub accumulate_webzine_space_info {
    local($from_page_entry, $to_page_entry) = @_;
    local($from_page, $from_time);

    ($from_page, $from_time) = split(/:/, $from_page_entry);

    #DEBUG# print "DEBUG : accumulate_webzine_space_info\t($from_page_entry, $to_page_entry)\n"; #DEBUG#

    # If the "from_page" is in "webzine-space", then the time spent on that page
    # was spent in "webzine-space"
    &page_is_of_interest($from_page) ? $from_time : 0;
}

# See if the page is one we're interested in (i.e. webzine-related).
# Returns 1 when the whole page name matches any pattern in
# @webzine_patterns, and "" otherwise.
sub page_is_of_interest {
    local($page) = @_;
    local($filter);

    #DEBUG# print "DEBUG : page_is_of_interest\t($page)\n"; #DEBUG#

    # Check for a match by comparing each of the patterns against the page name
    foreach $filter (@webzine_patterns) {
        #DEBUG# print "filter=$filter "; #DEBUG#
        return 1 if ($page =~ /^$filter$/);
    }
    #DEBUG# print "\n"; #DEBUG#

    "";     # FALSE: no pattern matched
}

# Print out statistics on any back issues for this "Best of Net" item.
# Reads @sorted_backissue_keys (keys of the form "<item>:<date>") and
# %bestnet_backissue_count.  No trailing newline is printed.
sub print_bestnet_backissues {
    local($item) = @_;
    local($col, $date, $printed, $key);

    #DEBUG# print "DEBUG : print_bestnet_backissues\t($item)\n"; #DEBUG#

    $printed = 0;
    foreach $key (@sorted_backissue_keys) {
        ($col, $date) = split(/:/, $key);
        if ($col eq $item) {
            if ($printed == 1) {
                # Second and subsequent back issues for a given item start a
                # new line, padded to the width of the "%-5s %5d " item
                # columns (12 characters) so the back issue columns line up
                print "\n", " " x 12;
            }
            printf "%10s %5d", $date, $bestnet_backissue_count{$key};
            $printed = 1;
        }
    }
}

# Print out statistics on any back issues for this column.
# Reads @sorted_backissue_keys (keys of the form "<column>:<date>") and the
# %column_backissue_page1_count / %column_backissue_page2_count hashes.
# No trailing newline is printed.
sub print_column_backissues {
    local($column) = @_;
    local($col, $date, $completion, $printed, $key);

    #DEBUG# print "DEBUG : print_column_backissues\t($column)\n"; #DEBUG#

    $printed = 0;
    foreach $key (@sorted_backissue_keys) {
        ($col, $date) = split(/:/, $key);
        if ($col eq $column) {
            # percent_of yields 0 when the page-1 count is 0 (only page 2 of
            # the back issue was ever hit) instead of dividing by zero
            $completion = &percent_of($column_backissue_page2_count{$key},
                                      $column_backissue_page1_count{$key});
            if ($printed == 1) {
                # Second and subsequent back issues for a given column start
                # a new line, padded to the width of the "%-7s %5d %5d %6.2f"
                # column fields (26 characters) so the back issue columns line up
                print "\n", " " x 26;
            }
            printf "%10s %5d %5d %6.2f", $date, $column_backissue_page1_count{$key},
                   $column_backissue_page2_count{$key}, $completion;
            $printed = 1;
        }
    }
}