Appendix B - Perl scripts for WWW logfile analysis

As part of my attempts to investigate Hip Webzine readership patterns using the information recorded in the World Wide Web server's logfile, I wrote a suite of three Perl scripts, as documented in the body of my paper and in the accompanying work report. The three scripts are included below. They were written for version 5 of Perl as installed on a NeXT workstation running version 3.1 of the NeXTStep operating system; however, they should be portable, with a minimum of change, to other operating systems which run Perl. Electronic versions of these scripts are available upon request to the author. It is hoped that the comments within the scripts themselves (accompanied by a rudimentary familiarity with Perl and Unix) will sufficiently document their workings. The logfile is assumed to be in the format of the EMWAC Windows NT Web server, but only minimal changes would be required to adapt the scripts to other logfile formats.

The "sessions.pl" script
The "transit.pl" script
The "hipstats.pl" script

The "sessions.pl" script

#!/usr/local/bin/perl

# ###############################################################################
#
# Script name:		sessions
# Script author:		Michael Hayward
#
# Description:
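#	This perl script reads through a single logfile as specified on the
#	invoking command line, and looks for "sessions" from individual IP
#	numbers. It writes session records to a ".ssn" file and per-page
#	records to a ".pgs" file (both named after the logfile), and prints
#	progress messages and summary statistics on standard out.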
#
#	A session is considered to be a sequence of "hits" from a given IP
#	number, where the timestamps between hits from the same IP is less
#	than the value in $maximum_interpage_time
#
#	The logfile is assumed to be in the format of the EMWAC Windows NT WWW server.
#
# ###############################################################################

require "timelocal.pl";
require "ctime.pl";

# Local variables and "constants"
$maximum_interpage_time = 5 * 60; 	# seconds. If hits from the same IP are further apart
								# than this, it is considered another session.

# Lookup tables translating the (lowercased) month and weekday names found in the logfile
# into the zero-based numbers handed to "&timelocal"
%Month = ("jan",0, "feb",1, "mar",2, "apr",3, "may",4, "jun",5,
	"jul",6, "aug",7, "sep",8, "oct",9, "nov",10, "dec",11);
%Week = ("sun",0, "mon",1, "tue", 2, "wed",3, "thu",4, "fri",5, "sat",6);

# Zero the bar chart counters: 21 session-time buckets (0..20) and 17 page-count
# buckets (0..16). They are filled in by "print_session".
@timebucket = (0) x 21;
@pagebucket = (0) x 17;

# Counters reported in the final summary. $total_hits and $this_days_session_count are
# initialized here so that they print as "0" (rather than as an empty string) when the
# logfile turns out to contain no usable lines.
$total_hits = 0;
$bad_lines = 0;
$pages_ignored = 0;
$IPs_ignored = 0;
$this_days_session_count = 0;

# Sentinel for "no timestamp seen yet". It has to be larger than any real timestamp: the
# value used previously (999999999) is an instant in September 2001, so for any logfile
# written after that the "earliest" timestamp would never have been updated.
$earliest_timestamp = 9999999999;
$latest_timestamp = 0;
$wday_being_processed = "XXX";	# "XXX" means: no day has been processed yet

$total_IPs = 0;
$total_sessions = 0;
$total_of_session_times = 0;
$total_of_session_pages = 0;
$max_sessions = 0;
$max_sessions_IP = "";

# The "shortest" values start out impossibly large, the "longest" ones at zero, so that
# the first session printed replaces them
$shortest_session_time = 999999999;
$shortest_session_time_IP = "";
$shortest_session_pages = 999999999;
$shortest_session_pages_IP = "";

$longest_session_time = 0;
$longest_session_time_IP = "";
$longest_session_pages = 0;
$longest_session_pages_IP = "";

$longest_page_interval = 0;
$longest_page = "";
$max_page_count = 0;
$max_page_count_page = "";

# The logfile we are to process is supplied via a parameter to the command line which invokes
# this script
$log_file = $ARGV[0];

# We will process the logfile and write session and page-reference information to separate files
# The names of the files are derived from the name of the logfile: if the logfile name is
# HS941217.LOG, the two output files will be HS941217.ssn (sessions) and HS941217.pgs (pages).
# Only the file name itself is cut at its first "."; any leading directory part is kept as is,
# so that a path such as ./logs/HS941217.LOG yields ./logs/HS941217.ssn (splitting the whole
# path on "." used to produce the useless names ".ssn" and ".pgs" in that case).
($log_dir, $log_base) = ($log_file =~ m|^(.*/)?([^/]*)$|);
($filename_root, $filename_suffix) = split(/\./, $log_base);
$filename_root = $log_dir . $filename_root;
$sessions_file = $filename_root . ".ssn";
$pages_file = $filename_root . ".pgs";
print "Session and page summary for logfile $log_file to files $sessions_file and $pages_file\n";

# Include the system's reason ($!) in the message when an output file cannot be created
open (SESSIONSFILE, ">$sessions_file") || die("Unable to open file $sessions_file for write: $!");
open (PAGESFILE, ">$pages_file") || die("Unable to open file $pages_file for write: $!");

# Main pass over the logfile: every line is one "hit". Each usable hit either opens the
# first session for its IP, extends that IP's current session, or closes the current
# session and opens a new one.
GETLINE: while ($LINE = <>)
   {
	$total_hits += 1;

	# Work on a lowercased copy of the line, for consistency, with its final
	# character (the newline) removed
	($line = $LINE) =~ tr/A-Z/a-z/;
	chop ($line);

	# Break the line into its blank-separated fields, and the time of day into its parts
	($day_name, $mon_name, $day_date, $time, $year, $incoming_IP_addr, $remote_IP_addr, 
		$action, $page, $server) = split(/ +/, $line);
	($hour, $minute, $second) = split(/:/, $time);
	$page =~ s/(\S*)(\s*)$/$1/;            # no trailing blanks on the page name
	$remote_IP_addr =~ s/(\S*)(\s*)$/$1/;  # ... nor on the remote IP addr

	# Weed out the lines we do not want, counting each kind: partial or non-"get"
	# records first, then pages to be ignored (i.e. "gif" files), then IPs to be
	# ignored (i.e. local IPs)
	if ($action ne "get") { $bad_lines += 1; next GETLINE; }
	if (&ignore_page($page)) { $pages_ignored += 1; next GETLINE; }
	if (&ignore_IP($remote_IP_addr)) { $IPs_ignored += 1; next GETLINE; }

	# "&timelocal" wants the year as an offset from 1900
	$year -= 1900;

	# Turn the textual logfile timestamp into internal time format, so that intervals
	# between hits can be calculated by subtraction. The last three values are weekday,
	# day of year and "isdst"; it doesn't seem to matter if 0 or 1 is chosen for "isdst"
	$timestamp = &timelocal($second, $minute, $hour, $day_date, $Month{$mon_name}, $year,
		$Week{$day_name}, 0, 0);

	# Keep track of the period covered by the logfile
	$earliest_timestamp = $timestamp if ($timestamp < $earliest_timestamp);
	$latest_timestamp = $timestamp if ($timestamp > $latest_timestamp);

	# Show our progress whenever the weekday name changes; except at the very start,
	# report the session count of the day just finished first
	if ($day_name ne $wday_being_processed) {
		print "$this_days_session_count sessions for the above date\n"
			unless ($wday_being_processed eq "XXX");
		$day_being_processed = &ctime($timestamp);
		chop($day_being_processed);
		print "Now processing logfile data for $day_being_processed ...\n";
		$wday_being_processed = $day_name;
		$this_days_session_count = 0;
	}

	# First hit ever from this IP: open its first session
	if (! defined($session_start_time{$remote_IP_addr})) {
		$session_count{$remote_IP_addr} = 0;
		&start_session($remote_IP_addr, $timestamp, $page);
		next GETLINE;
	}

	# Too long since this IP's previous hit: write out the session it had going and
	# open a new one
	my $gap = $timestamp - $session_latest_hit_time{$remote_IP_addr};
	if ($gap > $maximum_interpage_time) {
		&print_session($remote_IP_addr);
		&start_session($remote_IP_addr, $timestamp, $page);
		next GETLINE;
	}

	# Otherwise the hit belongs to the session in progress
	$session_num_of_pages{$remote_IP_addr} += 1;

	# The page list grows by the seconds spent on the previous page (giving
	# pagename:time) followed by the name of the page just hit
	$interval = $gap;
	$session_pages{$remote_IP_addr} .= ":$interval $page";

	# Moving on to a new page is what tells us how long the previous page was "read",
	# so the hit on the previous page is recorded now, with that interval
	&record_a_hit($session_latest_hit_page{$remote_IP_addr}, $interval);

	# The page just hit becomes the session's most recent page
	$session_latest_hit_page{$remote_IP_addr} = $page;
	$session_latest_hit_time{$remote_IP_addr} = $timestamp;
  }

# At end of input every IP seen still has one session "in progress". Write each of them
# out (the order does not matter), counting the distinct IPs as we go.
$total_IPs = 0;
foreach $remote_IP_addr (keys(%session_start_time)) {
	$total_IPs++;
	&print_session($remote_IP_addr);
}

# Write one record per referenced page to the pages file, in page name order, and
# count the pages
$total_unique_pages = 0;
foreach $page (sort(keys(%page_count))) {
	$total_unique_pages++;
	&print_page($page);
}

# Session count for the last day in the logfile
print "$this_days_session_count sessions for the above date\n";

# Both output files are complete now
close(SESSIONSFILE);
close(PAGESFILE);

# =====================================================================
#   P R I N T   O U T   S T A T I S T I C A L   I N F O R M A T I O N
#
#		Session-related statistical information
#
$start = &ctime($earliest_timestamp);
$finish = &ctime($latest_timestamp);
chop($start); chop($finish);
print "Session summary for $log_file ($start to $finish):\n";
print "=======================================================================================\n";
print "Total page references (hits)  : $total_hits ($pages_ignored of these excluded by page name; $IPs_ignored by IP)\n";
print "Total bad data records        : $bad_lines\n";

print "Total sessions                : $total_sessions (max interpage time=$maximum_interpage_time seconds";
$time_in_minutes = $maximum_interpage_time / 60;
printf " / %-5.2f minutes)\n", $time_in_minutes;

print "Total unique pages referenced : $total_unique_pages\n";
print "Total # of different readers  : $total_IPs\n\n";

$time_in_minutes = $total_of_session_times / 60;
printf "Total of all session times    : %9.2f minutes\n", $time_in_minutes;

$time_in_minutes = $shortest_session_time / 60;
printf "Shortest session (time)       : %9.2f minutes", $time_in_minutes;
print " (from $shortest_session_time_IP)\n";

$time_in_minutes = $longest_session_time / 60;
printf "Longest session (time)        : %9.2f minutes", $time_in_minutes;
print " (from $longest_session_time_IP)\n";

# All averages and percentages below are "per session". Divide by 1 instead of 0 when the
# logfile yielded no sessions at all: the totals are then 0 as well, so every average and
# percentage comes out as 0 instead of the script dying with "Illegal division by zero".
$session_divisor = $total_sessions || 1;

$average = $total_of_session_times / $session_divisor / 60; # Calculate average time as minutes
printf "Average session (time)        : %9.2f minutes\n", $average;
print "Shortest session (pages)      : $shortest_session_pages (from $shortest_session_pages_IP)\n";
print "Longest session (pages)       : $longest_session_pages (from $longest_session_pages_IP)\n";
$average = $total_of_session_pages / $session_divisor;
printf "Average session (pages)       : %-9.2f\n", $average;

# =====================================================================
#		Remaining statistical information
#
print "Max sessions by one reader    : $max_sessions (from $max_sessions_IP)\n";
print "Max page count for a page     : $max_page_count (for $max_page_count_page)\n";

$time_in_minutes = $longest_page_interval / 60;
printf "Longest time reading a page   : %9.2f minutes", $time_in_minutes;
print " (for $longest_page)\n";

# =====================================================================
#		Bar chart distribution of session length in time and # of pages
#
# The labels have to agree with the buckets filled in by "print_session": time bucket
# 1..19 holds the sessions lasting up to that many minutes, and bucket 20 holds
# everything longer than 19 minutes (it used to be mislabelled ">=20"). Bucket 0
# (zero-length sessions) is not charted.
@bucket_label = (" none", " <1.0", " <2.0", " <3.0", " <4.0", " <5.0", " <6.0", " <7.0", 
	 " <8.0", " <9.0", "<10.0", "<11.0", "<12.0", "<13.0", "<14.0", "<15.0", "<16.0",
	 "<17.0", "<18.0", "<19.0", ">19.0");
# Print the stats as a bar chart, with the axes flipped: one row per bucket, and one "%"
# character for every full percent of the sessions falling into that bucket
$chart_ruler = "======|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
print "\nDistribution of session time as bar chart\n";
print "Sessions lasting indicated # of minutes (from $total_sessions sessions)\n";
print " Time |  Cnt |  % |\n";
print $chart_ruler;
for ($i = 1; $i <= 20; $i += 1) {
	$percent = ($timebucket[$i] * 100) / $session_divisor;
	printf "%5s | %4d | %2d |", $bucket_label[$i], $timebucket[$i], $percent;
	print "%" x int($percent), "\n";
}
print $chart_ruler;

# Page-count buckets as filled in by "print_session": bucket 0 is exactly 1 page, buckets
# 1..15 are "up to and including" 3, 6, ... 45 pages, and bucket 16 is everything above 45.
# (The labels used to read "<3", "<6", ... and "<48" for the last bucket, which did not
# match the buckets.)
@bucket_label = ("   1", " <=3", " <=6", " <=9", "<=12", "<=15", "<=18", "<=21", "<=24",
	 "<=27", "<=30", "<=33", "<=36", "<=39", "<=42", "<=45", " >45");
$chart_ruler = "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
print "\nDistribution of session page count as bar chart\n";
print "Sessions consisting of the indicated # of pages (\"hits\") (from $total_sessions sessions)\n";
print "Pages|  Cnt |  % |\n";
print $chart_ruler;
for ($i = 0; $i <= 16; $i += 1) {
	$percent = ($pagebucket[$i] * 100) / $session_divisor;
	printf "%4s | %4d | %2d |", $bucket_label[$i], $pagebucket[$i], $percent;
	print "%" x int($percent), "\n";
}
print $chart_ruler;

# =====================================================================
#		S U B R O U T I N E S    E T C .
# =====================================================================
sub ignore_IP {
	# Decide whether hits from a given address are to be left out of the statistics.
	#   Parameter: the remote IP address of a hit, as text
	#   Result:    true if the address starts with "204.94.123." (the Hip IPs,
	#              presumably local), false otherwise
	my($candidate_addr) = @_;

	#DEBUG# print "DEBUG : ignore_IP\t($candidate_addr)\n";  #DEBUG#

	return $candidate_addr =~ /^204\.94\.123\./;	# Exclude Hip IPs
}

# =====================================================================
sub ignore_page {
	local($page) = @_;

	# Filter out hits on certain pages (gifs etc)
	#   Parameter: the (lowercased) page name of a hit
	#   Result:    true if the page is to be ignored, false otherwise

	#DEBUG# print "DEBUG : ignore_page\t($page)\n";  #DEBUG#

	# The dot has to be escaped: an unescaped "." matches any character, so that any
	# page whose name merely ended in "gif" (e.g. "/cgi-bin/makegif") was ignored too
	return $page =~ /\.gif$/;
}

# =====================================================================
sub record_a_hit {
	# Record one reference to a page, together with the time spent "reading" it.
	#   Parameters: the page name, and the reading time (interval) in seconds. An
	#               interval of -1 means "unknown: use the average time for this page"
	#   Result:     the interval actually recorded (the given one, or the estimate)
	#   Updates:    %page_count, %page_total_intervals, $max_page_count,
	#               $max_page_count_page, $longest_page_interval, $longest_page
	my($page, $interval) = @_;

	#DEBUG# print "DEBUG : record_a_hit\t($page; $interval)\n";  #DEBUG#

	if ($interval == -1) {
		# Estimate the reading time as the average over the hits recorded so far for
		# this page (this hit itself is not counted when calculating that average).
		# If this is the only reference so far, use 1 second
		$interval = defined($page_count{$page})
			? $page_total_intervals{$page} / $page_count{$page}
			: 1;
	}

	# Increment the counters for this page (they start out from nothing for a page
	# that hasn't been referenced before)
	$page_count{$page} += 1;
	$page_total_intervals{$page} += $interval;

	# Keep track of the most frequently referenced page ...
	if ($page_count{$page} > $max_page_count) {
		$max_page_count = $page_count{$page};
		$max_page_count_page = $page;
	}

	# ... and of the longest interval
	if ($interval > $longest_page_interval) {
		$longest_page_interval = $interval;
		$longest_page = $page;
	}

	# Hand back the interval (or average interval, if we calculated it) with an explicit
	# return, rather than by assigning it to a throwaway global as the last statement
	return $interval;
}

# =====================================================================
sub start_session {
	# Open a new session for an IP, (re)setting everything that is tracked per session.
	#   Parameters: the remote IP address, the timestamp of the hit that opens the
	#               session, and the page that was hit
	my($ip, $opened_at, $first_page) = @_;

	#DEBUG# print "start_session\t($ip, $opened_at, $first_page)\n";  #DEBUG#

	# One more session for this IP, and for the day being processed
	$session_count{$ip} += 1;
	$this_days_session_count += 1;

	# So far the session consists of its first hit only
	$session_start_time{$ip} = $opened_at;
	$session_latest_hit_time{$ip} = $opened_at;
	$session_latest_hit_page{$ip} = $first_page;
	$session_num_of_pages{$ip} = 1;
	$session_length{$ip} = 0;

	# The list of pages in the session. As the session proceeds, the "reading time"
	# spent on each page gets appended after the page name
	$session_pages{$ip} = $first_page;

	# Remember which IP has had the most sessions
	if ($session_count{$ip} > $max_sessions) {
		($max_sessions, $max_sessions_IP) = ($session_count{$ip}, $ip);
	}
}

# =====================================================================
sub print_page {
	# Write the record for one page to the pages file:
	#   pagename <tab> : count <tab> total-of-intervals <tab> average-interval
	#   Parameter: the page name (a key of %page_count)
	my($name) = @_;

	$average = $page_total_intervals{$name} / $page_count{$name};
	print PAGESFILE $name, "\t : ",
		join("\t", $page_count{$name}, $page_total_intervals{$name}, $average), "\n";
}

# =====================================================================
sub print_session {
	# Close the current session of an IP: write its record to the sessions file and
	# fold it into the grand totals, the bar chart buckets and the shortest/longest
	# statistics.
	#   Parameter: the remote IP address whose session is to be written out
	my($ip) = @_;
	#DEBUG# print "DEBUG : print_session\t($ip)\n";  #DEBUG# 

	$total_sessions += 1;

	# The logfile doesn't tell us when the session ended, so the hit on its last page is
	# recorded with an interval of -1: "record_a_hit" then uses the average interval for
	# that page, and returns it
	$average_interval = &record_a_hit($session_latest_hit_page{$ip}, -1);

	# Session length: first hit to last hit, plus the (estimated) time spent reading
	# the last page
	$session_length{$ip} = $session_latest_hit_time{$ip} - $session_start_time{$ip}
		+ $average_interval;

	# The session record: IP, start time, length and number of pages (tab separated),
	# then " : " and the list of pages with their reading times. The last page gets the
	# estimated reading time, rather than nothing at all.
	print SESSIONSFILE join("\t", $ip, $session_start_time{$ip}, $session_length{$ip},
		"$session_num_of_pages{$ip} : "), "$session_pages{$ip}:$average_interval\n";

	# Grand totals for the entire logfile
	$total_of_session_times += $session_length{$ip};
	$total_of_session_pages += $session_num_of_pages{$ip};
	
	# Bar chart bucket for the session time: bucket 0 for a zero length (won't be used),
	# otherwise the first of the buckets 1..19 whose limit of 60, 120, ... 1140 seconds
	# is not exceeded, and bucket 20 for anything longer than that
	$session_webzine_time = $session_length{$ip};
	if ($session_webzine_time == 0) {
		$timebucket[0] += 1;
	} else {
		my $slot = 1;
		$slot += 1 while ($slot < 20 && $session_webzine_time > 60 * $slot);
		$timebucket[$slot] += 1;
	}

	# Bar chart bucket for the page count: bucket 0 for exactly one page, otherwise the
	# first of the buckets 1..15 whose limit of 3, 6, ... 45 pages is not exceeded, and
	# bucket 16 for anything above that
	$session_webzine_pages = $session_num_of_pages{$ip};
	if ($session_webzine_pages == 1) {
		$pagebucket[0] += 1;
	} else {
		my $slot = 1;
		$slot += 1 while ($slot < 16 && $session_webzine_pages > 3 * $slot);
		$pagebucket[$slot] += 1;
	}

	# Shortest and longest session so far, by time and by number of pages
	($shortest_session_time, $shortest_session_time_IP) = ($session_length{$ip}, $ip)
		if ($session_length{$ip} < $shortest_session_time);
	($shortest_session_pages, $shortest_session_pages_IP) = ($session_num_of_pages{$ip}, $ip)
		if ($session_num_of_pages{$ip} < $shortest_session_pages);
	($longest_session_time, $longest_session_time_IP) = ($session_length{$ip}, $ip)
		if ($session_length{$ip} > $longest_session_time);
	($longest_session_pages, $longest_session_pages_IP) = ($session_num_of_pages{$ip}, $ip)
		if ($session_num_of_pages{$ip} > $longest_session_pages);
}

The "transit.pl" script

#!/usr/local/bin/perl

# ###############################################################################
#
# Script name:		transit
# Script author:		Michael Hayward
#
# Description:
#	This perl script reads through a "sessions" logfile as specified on the
#	invoking command line, and gathers stats on the "transitions" between
#	the various pages in the sessions.
#
#	The session logfile is produced as output from the "sessions" script,
#	as it processes a Windows NT WWW server's logfile. The session logfile
#	is stored in a file with the suffix ".ssn"
#
# ###############################################################################

require "timelocal.pl";
require "ctime.pl";
$top = 25;	# Number of most frequent transitions to echo to standard out (0 or less:
		# just report the single most common transition)
$total_sessions = 0;
$most_popular_transition_count = 0;
$most_popular_transition = "";

$sessions_file = $ARGV[0];

# We will process the sessions file and write inter-page transition information to 2 separate files.
# The names of the files are derived from the name of the session file: if the sessions file name is
# HS941217.ssn, the two output files will be HS941217.trn (transitions sorted by To_page) and HS941217.trs 
# (transitions sorted by frequency of transition, from most to least).
# Only the file name itself is cut at its first "."; any leading directory part is kept as is,
# so that a path such as ./logs/HS941217.ssn yields ./logs/HS941217.trn (splitting the whole
# path on "." used to produce the useless names ".trn" and ".trs" in that case).
($sessions_dir, $sessions_base) = ($sessions_file =~ m|^(.*/)?([^/]*)$|);
($filename_root, $filename_suffix) = split(/\./, $sessions_base);
$filename_root = $sessions_dir . $filename_root;
$transitions_file = $filename_root . ".trn";
$sorted_transitions = $filename_root . ".trs";
print "Inter-page transition summary for session file $sessions_file (to files $transitions_file & $sorted_transitions)\n";

# Include the system's reason ($!) in the message when an output file cannot be created
open (TRANSITIONSFILE, ">$transitions_file") || die("Unable to open file $transitions_file for write: $!");
open (SORTEDFILE, ">$sorted_transitions") || die("Unable to open file $sorted_transitions for write: $!");

# Read through the entire sessions file, a line (session) at a time, collecting information on 
# inter-page "transitions"
GETLINE: while ($line = <>) {

	#DEBUG# print $line; #DEBUG#
	$total_sessions = $total_sessions + 1;

	chop ($line);

	# Split the session file line into pieces: the tab-separated session data in front
	# of the " : ", and the blank-separated list of page entries (pagename:readingtime)
	# behind it
	($part1, $part2) = split (/ : /, $line);
	($remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages) = split(/\t/, $part1);
	(@session_pages) = split(/ /, $part2);

	#DEBUG# print "DEBUG : $remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages\n";
	#DEBUG# print "DEBUG : $part2\n";

	# Go through all of the pages in the session and accumulate the inter-page transitions
	$from_page_entry = shift(@session_pages);
	# Register the transition out of "hyperspace" as well (i.e. start of session, entering
	# our server's space from parts unknown...). "Hyperspace" is represented by the page
	# entry ":0": an empty page name with a reading time of 0
	&register_a_transition(":0", $from_page_entry);
	$to_page_entry = shift(@session_pages);
	while ($to_page_entry ne "") {
		&register_a_transition($from_page_entry, $to_page_entry);
		$from_page_entry = $to_page_entry;
		$to_page_entry = shift(@session_pages);
	}
	# Register the transition into "hyperspace" as well (i.e. end of session, leaving
	# our server's space for parts unknown...
	&register_a_transition($from_page_entry, ":0");
}

# Write all transitions to the transitions file, sorted by transition key. As a side
# effect "print_transition" determines the most popular transition.
$total_transitions = 0;
foreach $transition (sort (keys(%transition_count))) {
	&print_transition(TRANSITIONSFILE, $transition, 0);
	$total_transitions += 1;
}

print "Total session records processed     : $total_sessions\n";
print "Total unique inter-page transitions : $total_transitions\n";
if ($top <= 0) {
	($from_page, $to_page) = split(/:/, $most_popular_transition);
	print "Most common inter-page transition   : $from_page to $to_page ($most_popular_transition_count times)\n";
} else {
	print "Top $top inter-page transitions (complete sorted list in $sorted_transitions):\n";
	print "From page\tTo page\tCount\tTotal inter-page delay\tShortest\tLongest\tAverage\n";
}
# Now print the transitions into another file, sorted from the most to the least frequent transition
# Echo the top $top transitions to STDOUT
$i = 0;
foreach $transition (sort byvalue (keys(%transition_count))) {
	$i += 1;
	&print_transition(SORTEDFILE, $transition, ($i <= $top));
}

close (TRANSITIONSFILE);
close (SORTEDFILE);

# =====================================================================
#		S U B R O U T I N E S    E T C .
# =====================================================================
# Sort comparison routine: orders transition keys ($a, $b) by descending transition count
sub byvalue {
	return $transition_count{$b} <=> $transition_count{$a};
}

sub print_transition {
	# Write the record for one transition to a file, optionally echoing it to STDOUT,
	# and keep track of the most popular transition seen so far.
	#   Parameters: the name of the filehandle to write to, the transition key (in the
	#               form from_page:to_page), and a flag asking for the echo
	my($handle, $key, $show) = @_;

	#DEBUG# print "DEBUG : print_transition\t($key)\n";  #DEBUG#

	# Take the key apart again, and work out the average delay for the transition
	($from_page, $to_page) = split(/:/, $key);
	$average = $transition_inter_page_delay_total{$key} / $transition_count{$key};

	# One tab-separated record: from page, to page, count, then total / shortest /
	# longest / average inter-page delay
	my $record = join("\t", $from_page, $to_page, $transition_count{$key},
		$transition_inter_page_delay_total{$key},
		$transition_shortest_inter_page_delay{$key},
		$transition_longest_inter_page_delay{$key}, $average) . "\n";

	print $handle $record;
	print $record if ($echo = $show);	# Echo to STDOUT (if asked)

	if ($transition_count{$key} > $most_popular_transition_count) {
		($most_popular_transition_count, $most_popular_transition)
			= ($transition_count{$key}, $key);
	}
}

sub register_a_transition {
	# Record one inter-page transition. The delay recorded for it is the reading time
	# of the page being left.
	#   Parameters: the page entry being left and the page entry being entered, each
	#               in the form pagename:readingtime
	my($source_entry, $target_entry) = @_;

	#DEBUG# print "DEBUG : register_a_transition\t($source_entry, $target_entry)\n";  #DEBUG#

	# Take both entries apart; the two page names make up the transition key
	($from_page, $from_time) = split(/:/, $source_entry);
	($to_page, $to_time) = split(/:/, $target_entry);
	$transition = "$from_page:$to_page";

	if (defined($transition_count{$transition})) {
		# Seen before: one more occurrence, and its delay is added to the total
		$transition_count{$transition} += 1;
		$transition_inter_page_delay_total{$transition} += $from_time;

		# A new shortest or a new longest inter-page delay?
		if ($from_time < $transition_shortest_inter_page_delay{$transition}) {
			$transition_shortest_inter_page_delay{$transition} = $from_time;
		} elsif ($from_time > $transition_longest_inter_page_delay{$transition}) {
			$transition_longest_inter_page_delay{$transition} = $from_time;
		}
	} else {
		# The first occurrence of this transition: its delay is the total, the
		# shortest and the longest all at once
		$transition_count{$transition} = 1;
		$transition_inter_page_delay_total{$transition} = $from_time;
		$transition_shortest_inter_page_delay{$transition} = $from_time;
		$transition_longest_inter_page_delay{$transition} = $from_time;
	}
}

The "hipstats.pl" script

#!/usr/local/bin/perl

# ###############################################################################
#
# Script name:		hipstats
# Script author:		Michael Hayward
#
# Description:
#	This perl script builds on the "sessions" and "transit" scripts to extract
#	some useful information specific to the hip webzine. It takes the name
#	of a logfile (or root), and processes the associated session and transition
#	files.
#
#	The session logfile is produced as output from the "sessions" script,
#	as it processes a Windows NT WWW server's logfile. The session logfile
#	is stored in a file with the suffix ".ssn". The transition files are
#	produced by the "transit" script: ".trn" suffix contains transition stats
#	sorted by "to" page; ".trs" suffix contains transition stats sorted from
#	most frequent to least frequently used.
#
# ###############################################################################

require "timelocal.pl";
require "ctime.pl";

# Patterns to use to determine if a page is webzine-related. Each entry is a
# regular expression; it is anchored at both ends (/^...$/) when it is matched
# against a page name.
@webzine_patterns = ("/col/.*", "/bestnet/.*", "/tour/.*", "/zodiac/.*", "/",
	"/cover.htm", "/cgi-bin/.*/tour.*htm");

$total_hits = 0;
$top = 10;		# Number of "most popular" pages to show hits for

# The logfile name (or root) is the only command line argument. Without it
# every derived filename would start with an empty root, so stop right away.
$filename = $ARGV[0];
if (! defined($filename) || $filename eq "") {
	die("Usage: $0 logfile\n");
}

# Get the filename root from the command line argument (everything before the
# first "."), and then determine the filenames we will be processing
($filename_root, $filename_suffix) = split(/\./,$filename);
$pages_file = $filename_root . ".pgs";
$sessions_file = $filename_root . ".ssn";
$transitions_file = $filename_root . ".trn";
$sorted_transitions = $filename_root . ".trs";

print "Hip Webzine stats from files $sessions_file/$pages_file/$transitions_file/$sorted_transitions\n";

# Open all four input files for reading. The explicit "<" keeps a filename that
# happens to start with ">" or end with "|" from being treated as an output file
# or a command, and $! reports why an open failed.
open (PAGESFILE, "< $pages_file") || die("Unable to open file $pages_file for read: $!");
open (SESSIONSFILE, "< $sessions_file") || die("Unable to open file $sessions_file for read: $!");
open (TRANSITIONSFILE, "< $transitions_file") || die("Unable to open file $transitions_file for read: $!");
open (SORTEDFILE, "< $sorted_transitions") || die("Unable to open file $sorted_transitions for read: $!");

###########################################################################################
#
#   R E P O R T S    B A S E D   O N   T H E   P A G E S   F I L E   ( R O O T . p g s )
#
###########################################################################################
# Process the pages file. Each line is split on " : " into a page name and a
# tab-separated list of: hit count, total of intervals, average. The values are
# stored in %page_count, %page_total_intervals and %average (keyed by page name)
# and the hit counts are summed into $total_hits.
print "\n=========================\n";
print "= Page based reports    =\n";
print "=========================\n";
GETLINE: while ($line = <PAGESFILE>) {

	#DEBUG# print $line; #DEBUG#

	# chomp (not chop) so that a final line without a newline keeps its last character
	chomp ($line);

	# Split the pages file line into pieces, and re-build the associative arrays
	# of page-related information
	($page, $part2) = split (/ : /, $line);

	# A line without the " : " separator (e.g. a blank line) carries no statistics;
	# skip it rather than record a page with an empty name
	next GETLINE if (! defined($part2));

	$page =~ s/\s+$//; # Trim trailing blanks on the page name

	($page_count{$page}, $page_total_intervals{$page}, $average{$page}) = split(/\t/, $part2);

	# Track the total hits in this logfile
	$total_hits += $page_count{$page};

	#DEBUG# print "$page, $page_count{$page}, $page_total_intervals{$page}, $average{$page}\n"; #DEBUG#
}

# ############################################################################
#
# Report # 1 & #2:	Calculate completion rate on the hip columns. And
#			while we're at it, determine "bestnet" readership too.
#
# Go through all the pages by name, and find information on the columns. They are
# named according to the following convention:
#	/col/thisweek/<column>1.htm	First page of current column
#	/col/thisweek/<column>2.htm	Second page of current column
#	/col/<date>/<column>1.htm	First page of back issue column
#	/col/<date>/<column>2.htm	Second page of back issue column
# "Best of the net" pages are named according to the following convention:
#	/bestnet/thisweek/index.htm	"Best of Net" index
#	/bestnet/thisweek/bnN.htm	Nth "Best of Net" item (N=1, 2, 3)
#	/bestnet/<date>/index.htm	Back issue "Best of Net" index
#	/bestnet/<date>/bnN.htm		Nth back issue "Best of Net" item
# where <date> has the form NNwkNN (two digits, "wk", two digits).
#
# Results are stored in:
#	%column_page1_count, %column_page2_count	keyed by column name
#	%column_backissue_page1_count, %column_backissue_page2_count
#							keyed by "column:date"
#	%bestnet_count					keyed by "index" or "bnN"
#	%bestnet_backissue_count			keyed by "index:date" or "bnN:date"
# Whenever one page of a column is seen, the count for its other page is set to
# 0 if it has not been seen yet, so that both hashes always have the same keys.
# The dots in ".htm" are escaped so that they only match a literal ".".
foreach $page (sort (keys(%page_count))) {
	if ($page =~ m{^/col/thisweek/(.*)1\.htm$}) {
		# Page one of a current issue column; column name in $1
		$column_name = $1;
		#DEBUG# print "Column $column_name\t"; #DEBUG#
		$column_page1_count{$column_name} = $page_count{$page};
		if (! defined($column_page2_count{$column_name})) {
			$column_page2_count{$column_name} = 0;
		}
	} elsif ($page =~ m{^/col/(\d\dwk\d\d)/(.*)1\.htm$}) {
		# Page one of a back issue column; date in $1, column name in $2
		#DEBUG# print "backissue1: $1 $2\n"; #DEBUG#
		$key = $2 . ":" . $1;
		$column_backissue_page1_count{$key} = $page_count{$page};
		if (! defined($column_backissue_page2_count{$key})) {
			$column_backissue_page2_count{$key} = 0;
		}
	} elsif ($page =~ m{^/col/thisweek/(.*)2\.htm$}) {
		# Page two of a current issue column; column name in $1
		$column_name = $1;
		if (! defined($column_page1_count{$column_name})) {
			$column_page1_count{$column_name} = 0;
		}
		$column_page2_count{$column_name} = $page_count{$page};
	} elsif ($page =~ m{^/col/(\d\dwk\d\d)/(.*)2\.htm$}) {
		# Page two of a back issue column; date in $1, column name in $2
		#DEBUG# print "backissue2: $1 $2\n"; #DEBUG#
		$key = $2 . ":" . $1;
		if (! defined($column_backissue_page1_count{$key})) {
			$column_backissue_page1_count{$key} = 0;
		}
		$column_backissue_page2_count{$key} = $page_count{$page};
	} elsif ($page =~ m{^/bestnet/thisweek/index\.htm$}) {
		# Current issue "Best of Net" index
		$bestnet_count{"index"} = $page_count{$page};
	} elsif ($page =~ m{^/bestnet/(\d\dwk\d\d)/index\.htm$}) {
		# Back issue "Best of Net" index; date in $1
		$key = "index" . ":" . $1;
		$bestnet_backissue_count{$key} = $page_count{$page};
	} elsif ($page =~ m{^/bestnet/thisweek/(bn\d)\.htm$}) {
		# Current issue "Best of Net" item; item name/number in $1
		$bestnet_count{$1} = $page_count{$page};
	} elsif ($page =~ m{^/bestnet/(\d\dwk\d\d)/(bn\d)\.htm$}) {
		# Back issue "Best of Net" item; date in $1, item name/number in $2
		$key = $2 . ":" . $1;
		$bestnet_backissue_count{$key} = $page_count{$page};
	}
}

# ############################################################################
#
# Report # 1:	Hit counts on page 1 and page 2 of each current column, with the
#		"completion rate" (page 2 hits as a percentage of page 1 hits),
#		followed by the same figures for that column's back issues.
print "\nReport 1: Hit count (and completion rate) for webzine columns, sorted by hit count\n";
print "Column    Hit counts:  Cmpltn  Back iss.  Hit counts:  Cmpltn\n";
print "name     Page1  Page2     (%)      date  Page1  Page2     (%)\n";
print "======   =====  =====  ======  ========  =====  =====  ======\n";
# Sort the backissue keys once before generating the report
@sorted_backissue_keys = sort (keys(%column_backissue_page1_count));
foreach $column (sort by_page1_hitrate (keys(%column_page1_count))) {
	# A column whose second page was hit but whose first page never was has a
	# page 1 count of 0; report 0% for it instead of dividing by zero.
	if ($column_page1_count{$column} > 0) {
		$completion = ($column_page2_count{$column} * 100) / $column_page1_count{$column};
	} else {
		$completion = 0;
	}
	printf "%-7s  %5d  %5d  %6.2f", $column, $column_page1_count{$column},
		$column_page2_count{$column}, $completion;
	&print_column_backissues($column);
	print "\n";
}

# ############################################################################
#
# Report # 2:	One line per current "Best of Net" item (most hits first),
#		each followed by the hit counts of that item's back issues.
print "\nReport 2: Hit count for \"Best of Net\" items, sorted by hit count\n",
	"Item     Hit                   Back iss.  Hit\n",
	"name     Count                     date  Count\n",
	"=====    =====                 ========  =====\n";

# The back issue printing routine walks this list, so sort it once up front
@sorted_backissue_keys = sort (keys(%bestnet_backissue_count));

@ranked_bestnet_items = sort by_bestnet_hitrate (keys(%bestnet_count));
foreach $item (@ranked_bestnet_items) {
	# Current issue figures first, then any back issues on the same line(s)
	printf("%-5s    %5d               ", $item, $bestnet_count{$item});
	&print_bestnet_backissues($item);
	print "\n";
}

# ################################################################
#
# Report # 3: Hit rates for all pages
#
# Lists the $top most frequently hit pages with their hit count, their share
# of $total_hits, and a bar of one "%" character per full 2% of that share.
#
$bar_ruler = "=======================================  =====  ======  |===1|0==2|0==3|0==4|0==5|0==6|0==7|0==8|0==9|0=10|0\n";
print "\nReport 3: Hit count for top $top of all pages, sorted by hit count\n",
	"(from $total_hits page hits on all pages for the period)\n",
	"                                           Hit    % of\n",
	"Page name                                count   total\n",
	$bar_ruler;

# Rank every page by hit count once; the ranking is kept in @sorted_pages
@sorted_pages = sort by_hitrate (keys(%page_count));

$i = 0;
foreach $page (@sorted_pages) {
	$rate = ($page_count{$page} * 100) / $total_hits;
	printf("%-35s  %9d  %6.2f  |", $page, $page_count{$page}, $rate);
	print "%" x int($rate / 2), "\n";

	# Stop once the requested number of pages has been shown
	$i += 1;
	if ($i >= $top) {
		last;
	}
}
print $bar_ruler;

# ################################################################
#
# Report # 4: Hit rates for all webzine-related pages
#
# Same layout as the all-pages report, but only pages accepted by
# page_is_of_interest are listed (percentages are still of $total_hits).
#
$bar_ruler = "=======================================  =====  ======  |===1|0==2|0==3|0==4|0==5|0==6|0==7|0==8|0==9|0=10|0\n";
print "\nReport 4: Hit count for top $top of all webzine-related pages, sorted by hit count\n",
	"(from $total_hits page hits on all pages for the period)\n",
	"                                           Hit    % of\n",
	"Page name                                count   total\n",
	$bar_ruler;

$i = 0;
foreach $page (@sorted_pages) {
	# Pages outside the webzine neither print nor count towards the limit
	next unless (&page_is_of_interest($page));

	$rate = ($page_count{$page} * 100) / $total_hits;
	printf("%-35s  %9d  %6.2f  |", $page, $page_count{$page}, $rate);
	print "%" x int($rate / 2), "\n";

	$i += 1;
	if ($i >= $top) {
		last;
	}
}
print $bar_ruler;

##############################################################################################
#
#   R E P O R T S    B A S E D   O N   T H E   S E S S I O N S   F I L E   ( R O O T . s s n )
#
##############################################################################################
# Process the sessions file. Each line is split on " : " into a tab-separated
# header (remote IP, start time, session length, number of pages) and a
# blank-separated list of "page:time" entries. For every session the time spent
# on webzine pages is totalled and one counter is incremented in each of:
#	@timebucket	[0] no time, [1]..[15] up to N*30 seconds, [16] more than 450 seconds
#	@percentbucket	[0] 0%, [1]..[19] up to N*5 percent of the session, [20] more than 95%
print "\n=========================\n";
print "= Session based reports =\n";
print "=========================\n";
$total_sessions = 0;
@timebucket = (0) x 17;		# Zero time bucket counters
@percentbucket = (0) x 21;	# Zero percent bucket counters

GETLINE2: while ($line = <SESSIONSFILE>) {

	#DEBUG# print $line; #DEBUG#

	$total_sessions += 1;

	# chomp (not chop) so that a final line without a newline keeps its last character
	chomp ($line);

	# Split the session file line into pieces
	($part1, $part2) = split (/ : /, $line);
	($remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages) = split(/\t/, $part1);
	(@session_pages) = split(/ /, $part2);

	#DEBUG# print "DEBUG : $remote_IP_addr, $session_start_time, $session_length, $session_num_of_pages\n";
	#DEBUG# print "DEBUG : $part2\n";

	# Go through all of the pages in the session and accumulate the "webzine-space" time
	$from_page_entry = shift(@session_pages);
	$to_page_entry = shift(@session_pages);
	$session_webzine_time = 0;
	while (defined($to_page_entry) && $to_page_entry ne "") {
		$session_webzine_time += &accumulate_webzine_space_info($from_page_entry, $to_page_entry);
		$from_page_entry = $to_page_entry;
		$to_page_entry = shift(@session_pages);
	}
	# Include the time spent on the last page of the session too
	$session_webzine_time += &accumulate_webzine_space_info($from_page_entry, $to_page_entry);

	# Share of the session spent in "webzine-space". A session of zero (or
	# missing) length is counted as 0% instead of dividing by zero.
	if ($session_length > 0) {
		$percent = ($session_webzine_time * 100) / $session_length;
	} else {
		$percent = 0;
	}

	# Pick the time bucket: 0 for no time at all, otherwise the first 30 second
	# step that the time fits under, or the overflow bucket 16.
	$bucket = 16;
	if ($session_webzine_time == 0) {
		$bucket = 0;
	} else {
		for ($k = 1; $k <= 15; $k += 1) {
			if ($session_webzine_time <= $k * 30) { $bucket = $k; last; }
		}
	}
	$timebucket[$bucket] += 1;

	# Pick the percent bucket the same way, in steps of 5%, with overflow
	# bucket 20 for anything above 95%.
	$bucket = 20;
	if ($percent == 0) {
		$bucket = 0;
	} else {
		for ($k = 1; $k <= 19; $k += 1) {
			if ($percent <= $k * 5) { $bucket = $k; last; }
		}
	}
	$percentbucket[$bucket] += 1;
}

# ################################################################
#
# Report # 5: Distribution of time in "webzine-space" in seconds
#
# The commented-out code below is the earlier one-line-per-statistic layout;
# the bar chart that follows it is what is actually printed.
#
#print "\nReport 5: Distribution of session time\nSessions spending indicated # of seconds in \"webzine-space\" (of $total_sessions sessions)\n";
#print "none 0-30  <60  <90 <120 <150 <180 <210 <240 <270 <300 <330 <360 <390 <420 <450 >450\n";
#print "==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====\n"; 
#for ($i = 0; $i <= 16; $i += 1) {
#	printf "%4d ", $timebucket[$i];
#} 
#print "(#)\n";
#for ($i = 0; $i <= 16; $i += 1) {
#	$percent = ($timebucket[$i] * 100) / $total_sessions;
#	printf "%4d ", $percent;
#} 
#print "(%)\n";

# One label per time bucket, in minutes. Bucket 16 collects every session of
# more than 450 seconds (7.5 minutes), hence the ">7.5" label.
@bucket_label = ("none", "<0.5", "<1.0",  "<1.5", "<2.0", "<2.5", "<3.0", "<3.5", "<4.0",
	 "<4.5", "<5.0", "<5.5", "<6.0", "<6.5", "<7.0", "<7.5", ">7.5");
# Try to print out the above stats as a bar chart, with the axes flipped: one
# row per bucket, with one "%" character per whole percent of all sessions.
print "\nReport 5: Distribution of session time as bar chart\n";
print "Sessions spending indicated # of minutes in \"webzine-space\" (of $total_sessions sessions)\n";
print "Time |  Cnt |  % |\n";
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
for ($i = 0; $i <= 16; $i += 1) {
	# With an empty sessions file there is nothing to take a percentage of
	if ($total_sessions > 0) {
		$percent = ($timebucket[$i] * 100) / $total_sessions;
	} else {
		$percent = 0;
	}
	printf "%4s | %4d | %2d |", $bucket_label[$i], $timebucket[$i], $percent;
	for ($j = 1; $j <= $percent; $j += 1) { print "%"; } print "\n";
}
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";

# ################################################################
#
# Report # 6: Distribution of time in "webzine-space" as % of total session time
#
# The commented-out code below is the earlier one-line-per-statistic layout;
# the bar chart that follows it is what is actually printed.
#
#print "\nReport 6: Distribution of session time\nSessions spending indicated % of session time in \"webzine-space\" (of $total_sessions sessions)\n";
#print "none  <5 <10 <15 <20 <25 <30 <35 <40 <45 <50 <55 <60 <65 <70 <75 <80 <85 <90 <95 <100\n";
#print "==== === === === === === === === === === === === === === === === === === === === ====\n";  
#for ($i = 0; $i <= 20; $i += 1) {
#	printf "%4d", $percentbucket[$i];
#} 
#print " (#)\n";
#for ($i = 0; $i <= 20; $i += 1) {
#	$percent = ($percentbucket[$i] * 100) / $total_sessions;
#	printf "%4d", $percent;
#} 
#print " (%)\n";

# One label per percent bucket (21 buckets, indices 0 to 20)
@bucket_label = ("none", "  <5", " <10",  " <15", " <20", " <25", " <30", " <35", " <40",
	 " <45", " <50", " <55", " <60", " <65", " <70", " <75", " <80", " <85", " <90", " <95", "<100");
# Print out the above stats as a bar chart, with the axes flipped: one row per
# bucket, with one "%" character per whole percent of all sessions.
print "\nReport 6: Distribution of session time as bar chart\n";
print "Sessions spending indicated % of session time in \"webzine-space\" (of $total_sessions sessions)\n";
print "  %  |  Cnt |  % |\n";
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";
for ($i = 0; $i <= 20; $i += 1) {
	# With an empty sessions file there is nothing to take a percentage of
	if ($total_sessions > 0) {
		$percent = ($percentbucket[$i] * 100) / $total_sessions;
	} else {
		$percent = 0;
	}
	printf "%4s | %4d | %2d |", $bucket_label[$i], $percentbucket[$i], $percent;
	for ($j = 1; $j <= $percent; $j += 1) { print "%"; } print "\n";
}
print "=====|======|====0====|===1|0===|===2|0===|===3|0===|===4|0===|===5|0===|===6|0===|===7|0===|===8|0===|===9|0===|==10|0\n";

##############################################################################################
#
#   R E P O R T S    B A S E D   O N   O N E   O F   T H E   T R A N S I T I O N S   F I L E S
#
#       ( R O O T . t r n   :   T R A N S I T I O N S   S O R T E D   B Y   " F R O M " )
#
##############################################################################################
# Process the transition file. Each line holds tab-separated fields: "from"
# page, "to" page, transition count, total / shortest / longest inter-page
# delay, and average. Every transition that touches "webzine-space" is filed
# under the key "from:to" in one of %exit_count, %entry_count or
# %inter_webzine_count, and the matching grand total is updated.
print "\n============================\n";
print "= Transition based reports =\n";
print "============================\n";
$total_transitions = 0;
$total_entries = 0;
$total_exits = 0;
$total_inter_webzine = 0;
GETLINE3: while ($line = <TRANSITIONSFILE>) {

	#DEBUG# print $line; #DEBUG#

	# chomp (not chop) so that a final line without a newline keeps its last character
	chomp ($line);

	# Split the transitions file line into pieces
	($from_page, $to_page, $transition_count, $transition_inter_page_delay_total,
		$transition_shortest_inter_page_delay, $transition_longest_inter_page_delay,
		$average) = split(/\t/, $line);

	$total_transitions += $transition_count;

	$transition = $from_page . ":" . $to_page;

	# Classify each end of the transition once, then decide what kind it is
	$from_is_webzine = &page_is_of_interest($from_page);
	$to_is_webzine = &page_is_of_interest($to_page);

	if ($from_is_webzine && (! $to_is_webzine)) {
		# Exit from "webzine-space"
		$total_exits += $transition_count;
		$exit_count{$transition} = $transition_count;
	} elsif ((! $from_is_webzine) && $to_is_webzine) {
		# Entry into "webzine-space"
		$total_entries += $transition_count;
		$entry_count{$transition} = $transition_count;
	} elsif ($from_is_webzine && $to_is_webzine) {
		# An inter-page transition within the webzine
		$total_inter_webzine += $transition_count;
		$inter_webzine_count{$transition} = $transition_count;
	}
}

# ################################################################
#
# Report # 7: Top $top ways our readers entered "webzine-space"
#
# Each row shows one "from:to" entry transition, its count, and that count as
# a percentage of all entries and of all transitions.
#
print "\nReport 7: Top $top ways our readers entered \"webzine-space\"\n",
	"($total_entries entries to webzine-space from $total_transitions total inter-page transitions)\n",
	"                                                                                   (% of   (% of \n",
	"Entry from:                          To webzine-space at:                 Count  entries)  total)\n",
	"==========================           ========================             =====  ========  ======\n";

@ranked_entries = sort by_entry_count (keys(%entry_count));
$i = 0;
foreach $transition (@ranked_entries) {
	# The key is "from:to"; take it apart again for display
	($from_page, $to_page) = split(/:/, $transition);

	$hits = $entry_count{$transition};
	$percent1 = ($hits * 100) / $total_entries;
	$percent2 = ($hits * 100) / $total_transitions;
	printf("%-35s  %-35s  %5d    %6.2f  %6.2f\n", $from_page, $to_page, $hits,
		$percent1, $percent2);

	# Stop once the requested number of rows has been shown
	$i += 1;
	if ($i >= $top) {
		last;
	}
}

# ################################################################
#
# Report # 8: Top $top ways our readers left "webzine-space"
#
# Each row shows one "from:to" exit transition, its count, and that count as a
# percentage of all exits and of all transitions. (The first percentage column
# is headed "exits", since it is computed from $total_exits.)
#
print "\nReport 8: Top $top ways our readers left \"webzine-space\"\n";
print "($total_exits exits from webzine-space from $total_transitions total inter-page transitions)\n";
print "                                                                                   (% of   (% of \n";
print "Exit from webzine-space at:          To:                                  Count    exits)  total)\n";
print "===========================          ========================             =====  ========  ======\n";
$i = 0;
foreach $transition (sort by_exit_count (keys(%exit_count))) {
	# The key is "from:to"; take it apart again for display
	($from_page, $to_page) = split(/:/, $transition);
	$percent1 = ($exit_count{$transition} * 100) / $total_exits;
	$percent2 = ($exit_count{$transition} * 100) / $total_transitions;
	printf "%-35s  %-35s  %5d    %6.2f  %6.2f\n", $from_page, $to_page, $exit_count{$transition},
		$percent1, $percent2;
	$i += 1;
	last if ($i >= $top);
}

# ################################################################
#
# Report # 9: Top $top inter-page transitions within "webzine-space"
#
# Each row shows one "from:to" transition between two webzine pages, its count,
# and that count as a percentage of all such links and of all transitions.
#
print "\nReport 9: Top $top inter-page transitions within \"webzine-space\"\n",
	"($total_inter_webzine inter-webzine transitions from $total_transitions total inter-page transitions)\n",
	"                                                                                   (% of   (% of \n",
	"Transition from:                     To:                                  Count    links)  total)\n",
	"===========================          ========================             =====  ========  ======\n";

@ranked_links = sort by_inter_webzine_count (keys(%inter_webzine_count));
$i = 0;
foreach $transition (@ranked_links) {
	# The key is "from:to"; take it apart again for display
	($from_page, $to_page) = split(/:/, $transition);

	$hits = $inter_webzine_count{$transition};
	$percent1 = ($hits * 100) / $total_inter_webzine;
	$percent2 = ($hits * 100) / $total_transitions;
	printf("%-35s  %-35s  %5d    %6.2f  %6.2f\n", $from_page, $to_page, $hits,
		$percent1, $percent2);

	# Stop once the requested number of rows has been shown
	$i += 1;
	if ($i >= $top) {
		last;
	}
}

# All reports are done: release the four input files
close (PAGESFILE);
close (SESSIONSFILE);
close (TRANSITIONSFILE);
close (SORTEDFILE);

# ######################################################################################
#     S U B R O U T I N E S
#
# Sort comparator: orders %bestnet_count keys by descending hit count.
sub by_bestnet_hitrate {
	return $bestnet_count{$b} <=> $bestnet_count{$a};
}

# Sort comparator: orders %entry_count keys by descending transition count.
sub by_entry_count {
	return $entry_count{$b} <=> $entry_count{$a};
}

# Sort comparator: orders %exit_count keys by descending transition count.
sub by_exit_count {
	return $exit_count{$b} <=> $exit_count{$a};
}

# Sort comparator: orders %inter_webzine_count keys by descending transition count.
sub by_inter_webzine_count {
	return $inter_webzine_count{$b} <=> $inter_webzine_count{$a};
}

# Sort comparator: orders %page_count keys by descending hit count.
sub by_hitrate {
	return $page_count{$b} <=> $page_count{$a};
}

# Sort comparator: orders %column_page1_count keys by descending page 1 hit count.
sub by_page1_hitrate {
	return $column_page1_count{$b} <=> $column_page1_count{$a};
}

# Return the number of seconds that were spent in "webzine-space" for one step
# of a session: the time recorded on the "from" page if that page matches one
# of @webzine_patterns, otherwise 0.
#
# Arguments:
#	$from_page_entry	"page:time" entry for the page being left
#	$to_page_entry		entry for the next page; accepted for the benefit
#				of existing callers but not needed for the result
# Returns: the time part of $from_page_entry, or 0 when the page is not
#	webzine-related or the entry carries no time.
sub accumulate_webzine_space_info{
	my($from_page_entry, $to_page_entry) = @_;

	my($from_page, $from_time) = split(/:/, $from_page_entry);
	my($filter);

	#DEBUG# print "DEBUG : accumulate_webzine_space_info\t($from_page_entry, $to_page_entry)\n"; #DEBUG#

	# An empty or missing entry names no page, so no time can be attributed
	return 0 if (! defined($from_page));

	# If the "from_page" is in "webzine-space", then the time spent on that page
	# was spent in "webzine-space"
	foreach $filter (@webzine_patterns) {
		#DEBUG# print "filter=$filter "; #DEBUG#
		if ($from_page =~ /^$filter$/) {
			return defined($from_time) ? $from_time : 0;
		}
	}
	return 0;
}

# Tell whether a page is webzine-related, i.e. whether its name matches any of
# the patterns in @webzine_patterns (each anchored at both ends).
#
# Arguments:
#	$page	page name to test
# Returns: a true value on the first matching pattern, otherwise a false one.
sub page_is_of_interest {
	local($page) = @_;

	#DEBUG# print "DEBUG : page_is_of_interest\t($page)\n"; #DEBUG#

	# Start out FALSE; the result of each match is kept in $answer and the
	# search ends as soon as one pattern matches
	$answer = "";
	foreach $filter (@webzine_patterns) {
		#DEBUG# print "filter=$filter "; #DEBUG#
		last if ($answer = ($page =~ /^$filter$/));
	}
	#DEBUG# print "\n"; #DEBUG#

	$answer;
}

# Print the back issue hit counts for one "Best of Net" item.
#
# Walks @sorted_backissue_keys (keys of the form "item:date") and, for every key
# whose item part equals $item, prints the date and the matching count from
# %bestnet_backissue_count. No trailing newline is printed.
#
# Arguments:
#	$item	item name to report on
sub print_bestnet_backissues {
	local($item) = @_;

	local($name, $issue, $lead_in);

	#DEBUG# print "DEBUG : print_bestnet_backissues\t($item)\n";  #DEBUG#

	# Nothing goes in front of the first back issue; every later one starts
	# on a new line, indented with blanks so that the columns line up
	$lead_in = "";
	foreach $key (@sorted_backissue_keys) {
		($name, $issue) = split(/:/, $key);
		next if ($name ne $item);

		print $lead_in;
		printf("%10s  %5d", $issue, $bestnet_backissue_count{$key});
		$lead_in = "\n                             ";
	}
}

# Print the back issue statistics for one column.
#
# Walks @sorted_backissue_keys (keys of the form "column:date") and, for every
# key whose column part equals $column, prints the date, the page 1 and page 2
# hit counts, and the completion rate (page 2 hits as a percentage of page 1
# hits). Second and later back issues start on a new, indented line. No
# trailing newline is printed.
#
# Arguments:
#	$column		column name to report on
sub print_column_backissues {
	my($column) = @_;

	my($key, $col, $date, $completion, $printed);

	#DEBUG# print "DEBUG : print_column_backissues\t($column)\n";  #DEBUG#

	$printed = 0;
	foreach $key (@sorted_backissue_keys) {
		($col, $date) = split(/:/, $key);
		if ($col eq $column) {
			# A back issue whose second page was hit but whose first page never
			# was has a page 1 count of 0; report 0% instead of dividing by zero
			if ($column_backissue_page1_count{$key} > 0) {
				$completion = ($column_backissue_page2_count{$key} * 100)
					/ $column_backissue_page1_count{$key};
			} else {
				$completion = 0;
			}
			if ($printed == 1) {
				# Second and subsequent back issues for a given column
				# are prefixed with a bunch of blanks to line up properly
				print "\n                             ";
			}
			printf "%10s  %5d  %5d  %6.2f", $date, $column_backissue_page1_count{$key},
				$column_backissue_page2_count{$key}, $completion;
			$printed = 1;
		}
	}
}