mvhub-dev team mailing list archive
-
mvhub-dev team
-
Mailing list archive
-
Message #00367
[Merge] lp:~omacneil/mvhub/new_external_reports_lee into lp:mvhub
Dan MacNeil has proposed merging lp:~omacneil/mvhub/new_external_reports_lee into lp:mvhub.
Requested reviews:
MVHub devs with commit rights (mvhub-commit)
see commit log
--
https://code.launchpad.net/~omacneil/mvhub/new_external_reports_lee/+merge/28787
Your team MVHub Developers is subscribed to branch lp:mvhub.
=== modified file 'app-mvhub/DocumentRoot/cgi-bin/mvhub/reports.pl'
--- app-mvhub/DocumentRoot/cgi-bin/mvhub/reports.pl 2010-06-25 14:58:28 +0000
+++ app-mvhub/DocumentRoot/cgi-bin/mvhub/reports.pl 2010-06-29 18:43:26 +0000
@@ -41,7 +41,7 @@
# /
# },
{ title => 'Agency Addition Dates',
- comment => "This gives the date when each agency was added to MVHub.",
+ comment => "Date when each agency was added to the site .",
sql => qq/
SELECT agency_name, date_created
FROM agency
@@ -49,10 +49,9 @@
/
},
- { title => 'Program Addition Dates',
- comment =>
- "This gives the date when each program was added to MVHub.",
- sql => qq/
+ { title => 'Program Addition Dates',
+ comment => "Date when each program was added to the site.",
+ sql => qq/
SELECT p.program_name, a.agency_name, p.date_created
FROM agency a, program p
WHERE p.agency_id = a.agency_id
@@ -242,6 +241,10 @@
if ( !defined $report_number ) {
$output = $cgi->h1('Administrative Reports') . "\n";
+ $output
+ .= $cgi->p(
+ 'Our more technical reports of traffic, searches and data freshness, run each time you click the link'
+ ) . "\n";
$output .= MVHub::Reports::print_report_list( $cgi,
'/cgi-bin/mvhub/reports.pl', @REPORTS );
print $cgi->header();
=== modified file 'app-mvhub/DocumentRoot/static/mvh/html/reports.shtml'
--- app-mvhub/DocumentRoot/static/mvh/html/reports.shtml 2010-06-22 19:07:08 +0000
+++ app-mvhub/DocumentRoot/static/mvh/html/reports.shtml 2010-06-29 18:43:26 +0000
@@ -16,18 +16,32 @@
<div id="contentwrapper">
<div id="contentcolumn">
<div class="innertube">
- <h1>PDF Reports</h1>
- <p> <a href='/reports/MVHub.com_agencies_and_all_programs.pdf'>
+ <h1>PDF Reports, print to paper</h1>
+ <p> <a href='/reports/mvh/MVHub.com_agencies_and_all_programs.pdf'>
All Agency and Program info
</a>
formatted for printing (.pdf)
</p>
<p>
- <a href='/reports/MVHub.com_agencies_only.pdf'>
+ <a href='/reports/mvh/MVHub.com_agencies_only.pdf'>
Agency info only
</a>
formatted for printing (.pdf)
</p>
+ <h1>External Reports</h1>
+ <p>
+ These reports are generated from the raw, unedited web server logs by 3rd party tools every night.
+ </p>
+ <p> <a href='/reports/mvh/visitors.html'>
+ Visitors Report
+ </a>
+ Who has visited this web site, how often and from where.
+ </p>
+ <p> <a href='/reports/mvh/analog.html'>
+ Analog Report
+ </a>
+ This is a full web statistics report.
+ </p>
<!--#include virtual="/cgi-bin/mvhub/reports.pl"-->
</div><!-- id=innertube -->
=== modified file 'app-mvhub/DocumentRoot/static/mvh/robots.txt'
--- app-mvhub/DocumentRoot/static/mvh/robots.txt 2010-03-12 01:35:37 +0000
+++ app-mvhub/DocumentRoot/static/mvh/robots.txt 2010-06-29 18:43:26 +0000
@@ -14,3 +14,11 @@
# when in DEVELOPMENT
# User-agent: *
# Disallow:
+
+# for both production and development
+# we don't want spiders driving up the load
+User-agent: *
+Disallow: /reports/
+
+User-agent: *
+Disallow: /html/reports.shtml
=== modified file 'app-mvhub/DocumentRoot/static/nsp/html/reports.shtml'
--- app-mvhub/DocumentRoot/static/nsp/html/reports.shtml 2010-06-22 19:00:02 +0000
+++ app-mvhub/DocumentRoot/static/nsp/html/reports.shtml 2010-06-29 18:43:26 +0000
@@ -12,24 +12,44 @@
<div id="topsection">
</div>
-
<div id="contentwrapper">
<div id="contentcolumn">
<div class="innertube">
- <h1>PDF Reports</h1>
- <p> <a href='/reports/NorthShorePort.org_agencies_and_all_programs.pdf'>
- All Agency and Program info
- </a>
- formatted for printing (.pdf)
+<h1>PDF Reports, print these to paper </h1>
+<p>These reports are generated every night</p>
+
+ <p> <a href='/reports/nsp/NorthShorePort.org_agencies_and_all_programs.pdf'>
+ All Agency and Program info
+ </a>
+ formatted for printing (.pdf)
</p>
<p>
- <a href='/reports/NorthShorePort.org_agencies_only.pdf'>
+ <a href='/reports/nsp/NorthShorePort.org_agencies_only.pdf'>
Agency info only
</a>
formatted for printing (.pdf)
</p>
+ <h1>External Reports</h1>
+ <p>
+ These reports are generated from the raw, unedited web
+ server logs by 3rd party tools every night.
+ </p>
+ <p> <a href='/reports/nsp/visitors.html'>
+ Visitors Report.
+ </a>
+ Who has visited this web site, how often and from
+ where.
+ </p>
+ <p> <a href='/reports/nsp/analog.html'>
+ Analog Report
+ </a>
+ This is a full web statistics report.
+ </p>
+
+
<!--#include virtual="/cgi-bin/mvhub/reports.pl"-->
+
</div><!-- id=innertube -->
</div><!-- id='contentcolumn' -->
</div><!-- id='contentwrapper' -->
=== modified file 'app-mvhub/DocumentRoot/static/nsp/robots.txt'
--- app-mvhub/DocumentRoot/static/nsp/robots.txt 2010-03-12 01:35:37 +0000
+++ app-mvhub/DocumentRoot/static/nsp/robots.txt 2010-06-29 18:43:26 +0000
@@ -14,3 +14,11 @@
# when in DEVELOPMENT
# User-agent: *
# Disallow:
+
+# for both production and development
+# we don't want spiders driving up the load
+User-agent: *
+Disallow: /reports/
+
+User-agent: *
+Disallow: /html/reports.shtml
=== added file 'app-mvhub/bin/run_external_reports'
--- app-mvhub/bin/run_external_reports 1970-01-01 00:00:00 +0000
+++ app-mvhub/bin/run_external_reports 2010-06-29 18:43:26 +0000
@@ -0,0 +1,312 @@
+#!/usr/bin/perl
+
+# Generate web traffic reports.
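+# Usage: run_external_reports [config_file]
+# The config file defaults to $MV_CONFIG_FILE when no argument is given.
+# NOTE: no --verbose option is implemented; the first argument is always
+# treated as the config file path.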
+
+use strict;
+use warnings;
+
+use Carp;
+use File::Basename;
+use File::Copy;
+
+use MVHub::Utils::ConfigSimple;
+
+# Configuration file: first command-line argument, else $MV_CONFIG_FILE.
+my $config_file = $ARGV[0] || $ENV{MV_CONFIG_FILE};
+my $CFG = MVHub::Utils::ConfigSimple::create_config_from($config_file);
+
+my $LOG_DIR  = $CFG->param('ABSOLUTE_PATH.log_dir');
+my $SITE_URL = $CFG->param('SITE.website_name');
+
+# analog also keeps a log for each site
+# this log is for this script
+my $LOGFILE = "$LOG_DIR/run_external_reports.log";
+
+{    # main
+    logentry("started run");
+
+    # Each stage of the pipeline gets its own named file so it is obvious
+    # which file a given step reads and which it writes.
+    my $live_log     = "$LOG_DIR/combined.log";
+    my $snapshot_log = "$LOG_DIR/combined.log.resolveme";
+    my $resolved_log = "$LOG_DIR/combined.log.resolved";
+    my $tmp_dir      = $CFG->param('ABSOLUTE_PATH.tmp_dir');
+    my $reports_dir  = $CFG->param('ABSOLUTE_PATH.reports_dir');
+
+    # 1. move the live log aside, 2. drop hits from the 10.0.0.x network,
+    # 3. turn IP numbers into hostnames, 4. run both report generators
+    #    against the resolved log.
+    move_logfile( $live_log, $snapshot_log );
+    strip_local_ip_numbers( $snapshot_log, '^10\.0\.0\.' );
+    replace_ip_with_hostnames( $snapshot_log, $resolved_log, $tmp_dir );
+
+    run_visitors_report( $resolved_log, "$reports_dir/visitors.html" );
+    run_analog_report( $resolved_log, "$reports_dir/analog.html" );
+
+    logentry("finished run");
+}
+
+# Move the combined log to another file for later processing:
+# we can't have apache & jdresolve both working on the same file.
+# A fresh, readable-by-all log file is left in place of $source.
+#
+# Params:
+#   $source      - path of the log file to move aside
+#   $destination - path to move it to
+# Returns: nothing. Returns early, without touching $source, when
+#   $destination already exists (leftover from a past failure).
+# Dies (via fatal_logentry) when the move, the re-create or the chmod fails.
+sub move_logfile {
+    my $source      = shift;
+    my $destination = shift;
+
+    logentry("moving $source to $destination");
+
+    # we don't want to clobber
+    # leftovers from a past failure
+    if ( -e $destination ) {
+        logentry("$destination exists... bad..not clobbering");
+        return;
+    }
+
+    # if file doesn't exist, create it so there is something to move.
+    # List-form system() bypasses the shell, so the path is passed to
+    # touch verbatim; the same /bin/touch is used for both calls.
+    ( system( '/bin/touch', $source ) == 0 )
+        or logentry("couldn't touch $source");
+
+    # apache will keep writing to renamed file
+    File::Copy::move( $source, $destination )
+        or fatal_logentry("couldn't move $source: $!\n");
+
+    # re-create an empty log under the original name ...
+    ( system( '/bin/touch', $source ) == 0 )
+        or fatal_logentry("bad touch: $source");
+
+    # ... and make sure everyone can read it
+    ( system( '/bin/chmod', 'a+r', $source ) == 0 )
+        or fatal_logentry("bad chmod: $source");
+
+    return;
+}
+
+# Remove every line matching $strip_regex from a log file, in place.
+#
+# Params:
+#   $log_to_strip - path of the log file to filter
+#   $strip_regex  - grep pattern; matching lines are dropped
+# Returns: 1 (also when the file contained no matching lines and was
+#   therefore left untouched).
+# Dies (via fatal_logentry) on a grep error or when the filtered file
+#   cannot be written or moved back over the original.
+#
+# grep is run WITHOUT a shell: an unquoted pattern such as ^10\.0\.0\.
+# loses its backslashes to the shell, turning every "\." into "any char".
+sub strip_local_ip_numbers {
+    my $log_to_strip = shift;
+    my $strip_regex  = shift;
+
+    my $stripped_log = "$log_to_strip.striped";
+
+    logentry("stripping local IPs from $log_to_strip");
+
+    # skip files that have no local ips (grep exits 1 when nothing matched)
+    system( '/bin/grep', '-q', '-e', $strip_regex, '--', $log_to_strip );
+    return 1 if ( ( $? >> 8 ) == 1 );
+
+    # list-form pipe open: grep -v writes the surviving lines to us
+    open( my $grep_out, '-|', '/bin/grep', '-v', '-e', $strip_regex, '--',
+        $log_to_strip )
+        or fatal_logentry("couldn't run grep on $log_to_strip: $!");
+    open( my $stripped_fh, '>', $stripped_log )
+        or fatal_logentry("couldn't write $stripped_log: $!");
+
+    while ( my $line = <$grep_out> ) {
+        print {$stripped_fh} $line
+            or fatal_logentry("couldn't write $stripped_log: $!");
+    }
+    close($stripped_fh)
+        or fatal_logentry("couldn't close $stripped_log: $!");
+
+    # close() on a pipe waits for grep and sets $?. Its return value is not
+    # checked: grep -v legitimately exits 1 when every line was stripped.
+    close($grep_out);
+    if ( ( $? >> 8 ) == 2 ) {
+        fatal_logentry("grep error in $log_to_strip");
+    }
+
+    move( $stripped_log, $log_to_strip )
+        or fatal_logentry("couldn't rename in strip_local_ip_number");
+
+    return 1;
+}
+
+# Run jdresolve over a log file, appending its output to $resolved_log,
+# then delete the input file.
+#
+# Params:
+#   $log_to_resolve - path of the log to feed to jdresolve (removed on success)
+#   $resolved_log   - path jdresolve's output is appended to (>>)
+#   $tmp_dir        - directory holding jdresolve_dns_cache.db
+#                     (presumably jdresolve's DNS cache -- confirm)
+# Croaks when a parameter is missing or false.
+# Dies (via fatal_logentry) when jdresolve fails or the input file cannot
+#   be removed.
+sub replace_ip_with_hostnames {
+    my $log_to_resolve = shift or croak 'missing param: $log_to_resolve';
+    my $resolved_log   = shift or croak 'missing param: $resolved_log';
+    my $tmp_dir        = shift or croak 'missing param $tmp_dir';
+
+    # --dbfirst used to be passed twice; once is enough
+    my $cmd = '/usr/bin/jdresolve --dbfirst -r -n'
+        . " --database=$tmp_dir/jdresolve_dns_cache.db"
+        . " $log_to_resolve >> $resolved_log";
+
+    logentry("resolving $log_to_resolve");
+
+    ( system($cmd) == 0 ) or fatal_logentry("command failed: $cmd");
+
+    unlink $log_to_resolve
+        or fatal_logentry("failed to remove $log_to_resolve: $!");
+
+    return 1;
+}
+
+# Generate the analog report for one log file.
+#
+# Params:
+#   $in  - path of the log file handed to analog
+#   $out - path of the report file handed to analog (+O)
+# Croaks when a parameter is missing or false.
+# Dies (via fatal_logentry) when a leftover analog DNS lock file exists.
+# A failing analog run is only logged as a warning; it does not stop the
+# script.
+#
+# NOTE(review): the generated analog.conf also lists LOGFILE
+# $LOG_DIR/combined*, which looks like it could match $in as well --
+# confirm that hits are not counted twice.
+sub run_analog_report {
+    my $in  = shift or croak 'missing param: input_file';
+    my $out = shift or croak 'missing param: output_file';
+
+    my $lock_file = '/var/cache/analog/dnsfile.txt.Lock';
+
+    logentry("start analog report for $in");
+
+    _generate_analog_conf_file();
+
+    my $cmd
+        = "/usr/bin/analog -G +g$LOG_DIR/analog.conf +O$out $in 2>/dev/null";
+
+    if ( -e $lock_file ) {
+        fatal_logentry("leftover analog lock file present: $lock_file");
+    }
+
+    if ( system($cmd) != 0 ) {
+
+        # $! is only meaningful when the command could not be started
+        # ($? == -1); otherwise the exit status is what matters.
+        my $reason
+            = ( $? == -1 )
+            ? "could not execute: $!"
+            : 'exit status ' . ( $? >> 8 );
+        logentry("Warning - couldn't run $cmd ($reason)");
+    }
+    logentry("done analog report for $out");
+
+    return;
+}
+
+# Write the analog configuration to "$LOG_DIR/analog.conf", overwriting any
+# previous copy. $LOG_DIR and $SITE_URL are interpolated into the text.
+#
+# Takes no parameters and returns nothing.
+# Croaks when the file cannot be opened, written or closed; write errors on
+# a buffered handle may only show up at close, so close is checked too.
+sub _generate_analog_conf_file {
+    my $conf_path = "$LOG_DIR/analog.conf";
+
+    open my $outfile, ">", $conf_path
+        or croak "Cannot open $conf_path: $!\n";
+    my $analog_conf = <<"END";
+# Configuration file for analog 4.01
+HOSTNAME "Debian Linux System"
+OUTFILE index.html
+ERRFILE $LOG_DIR/analog_errors.txt
+
+PAGEINCLUDE *.htm
+PAGEINCLUDE *.shtml
+PAGEINCLUDE *.html
+PAGEINCLUDE */
+
+# cache DNS lookups
+DNS write
+
+# to this file
+DNSFILE /var/cache/analog/dnsfile.txt
+DNSLOCKFILE /var/cache/analog/dnsfile.txt.Lock
+
+# keep DNS lookup results for this many hours
+DNSGOODHOURS 472
+
+
+LOGFILE $LOG_DIR/referer*
+LOGFORMAT (COMBINED)
+LOGFILE $LOG_DIR/combined*
+
+
+
+#reports we want
+ALL ON
+VHOST OFF
+PROCTIME OFF # don't log data for this
+
+LINKINCLUDE pages
+REFLINKINCLUDE pages
+
+# people following links to from outside
+# sites to us are interesting
+# our internal links are not interesting
+# exclude them from the report
+#
+# replace the lctc.org entries
+# with ones that are relevant to your site
+REFREPEXCLUDE http://10.0.0.5/*
+REFREPEXCLUDE http://$SITE_URL/*
+REFREPEXCLUDE http://www.$SITE_URL/*
+REFREPEXCLUDE http://*.$SITE_URL/*
+
+UNCOMPRESS *.gz,*.Z "zcat"
+BROWOUTPUTALIAS Mozilla Netscape
+BROWOUTPUTALIAS "Mozilla (compatible)" "Netscape (compatible)"
+BROWOUTPUTALIAS IWENG AOL
+SEARCHENGINE http://*altavista.*/* q
+SEARCHENGINE http://*yahoo.*/* p
+SEARCHENGINE http://*google.*/* q
+SEARCHENGINE http://*lycos.*/* query
+SEARCHENGINE http://*aol.*/* query
+SEARCHENGINE http://*excite.*/* search
+SEARCHENGINE http://*go2net.*/* general
+SEARCHENGINE http://*metacrawler.*/* general
+SEARCHENGINE http://*msn.*/* MT
+SEARCHENGINE http://*hotbot.com/* MT
+SEARCHENGINE http://*netscape.*/* search
+SEARCHENGINE http://*looksmart.*/* key
+SEARCHENGINE http://*infoseek.*/* qt
+SEARCHENGINE http://*webcrawler.*/* search,searchText
+SEARCHENGINE http://*goto.*/* Keywords
+SEARCHENGINE http://*snap.*/* keyword
+SEARCHENGINE http://*dogpile.*/* q
+SEARCHENGINE http://*askjeeves.*/* ask
+SEARCHENGINE http://*ask.*/* ask
+SEARCHENGINE http://*aj.*/* ask
+SEARCHENGINE http://*directhit.*/* qry
+SEARCHENGINE http://*alltheweb.*/* query
+SEARCHENGINE http://*northernlight.*/* qr
+SEARCHENGINE http://*nlsearch.*/* qr
+SEARCHENGINE http://*dmoz.*/* search
+SEARCHENGINE http://*newhoo.*/* search
+SEARCHENGINE http://*netfind.*/* query,search,s
+SEARCHENGINE http://*/netfind* query
+SEARCHENGINE http://*/pursuit query
+SUBTYPE *.gz,*.Z
+TYPEOUTPUTALIAS .html ".html [Hypertext Markup Language]"
+TYPEOUTPUTALIAS .htm ".htm [Hypertext Markup Language]"
+TYPEOUTPUTALIAS .ps ".ps [PostScript]"
+TYPEOUTPUTALIAS .gz ".gz [Gzip compressed files]"
+TYPEOUTPUTALIAS .html.gz ".html.gz [Gzipped HTML]"
+TYPEOUTPUTALIAS .ps.gz ".ps.gz [Gzipped PostScript]"
+TYPEOUTPUTALIAS .xbm ".xbm [X11 bitmaps]"
+TYPEOUTPUTALIAS .tar.gz ".tar.gz [Compressed archives]"
+TYPEOUTPUTALIAS .jpg ".jpg [JPEG graphics]"
+TYPEOUTPUTALIAS .jpeg ".jpeg [JPEG graphics]"
+TYPEOUTPUTALIAS .gif ".gif [GIF graphics]"
+TYPEOUTPUTALIAS .xbm ".xbm [X bitmap]"
+TYPEOUTPUTALIAS .txt ".txt [Plain text]"
+TYPEOUTPUTALIAS .class ".class [Java class files]"
+TYPEOUTPUTALIAS .pdf ".pdf [Adobe Portable Document Format]"
+TYPEOUTPUTALIAS .zip ".zip [Zip archives]"
+TYPEOUTPUTALIAS .hqx ".hqx [Macintosh archives]"
+TYPEOUTPUTALIAS .exe ".exe [Executables]"
+TYPEOUTPUTALIAS .wav ".wav [WAV sound files]"
+TYPEOUTPUTALIAS .png ".png [PNG graphics]"
+TYPEOUTPUTALIAS .avi ".avi [AVI movies]"
+TYPEOUTPUTALIAS .arc ".arc [Compressed archives]"
+TYPEOUTPUTALIAS .mid ".mid [MIDI sound files]"
+TYPEOUTPUTALIAS .doc ".doc [Microsoft Word document]"
+TYPEOUTPUTALIAS .rtf ".rtf [Rich Text Format]"
+TYPEOUTPUTALIAS .mov ".mov [Quick Time movie]"
+TYPEOUTPUTALIAS .mpg ".mpg [MPEG movie]"
+TYPEOUTPUTALIAS .mpeg ".mpeg [MPEG movie]"
+END
+    print {$outfile} $analog_conf
+        or croak "Cannot write $conf_path: $!\n";
+    close $outfile
+        or croak "Cannot close $conf_path: $!\n";
+
+    return;
+}
+
+# Produce the "visitors" HTML report for one log file.
+#
+# Params:
+#   $in  - path of the log file to analyse
+#   $out - path of the HTML file visitors should write
+# Croaks when a parameter is missing or false.
+# A failing run is reported with warn and in the script log; it does not
+# stop the script.
+sub run_visitors_report {
+    my ( $in, $out ) = @_;
+    croak 'missing param: input_file'  unless $in;
+    croak 'missing param: output_file' unless $out;
+
+    logentry("start visitor report for $in");
+
+    my $cmd = join ' ', '/usr/bin/visitors', '-A', $in, '-o', 'html',
+        '--output-file', $out;
+
+    # History (presumably David Siegal / Eric Adum, circa 2004): visitors
+    # .3a printed both status and error messages to STDERR, so STDERR is
+    # captured and only shown when the command exits non-zero. visitors
+    # does not fail for every error (e.g. an unreadable log file); a patch
+    # request was submitted at the time.
+    # danm 2010-06-22: current visitors version is 0.7-4.
+    my $captured = `$cmd 2>&1`;
+
+    if ( $? != 0 ) {
+        my $failure = "$cmd failed. $captured.\n";
+        warn $failure;
+        logentry($failure);
+    }
+
+    logentry("done visitor report for $out");
+}
+
+# Append one timestamped line to this script's own log ($LOGFILE).
+# kludge: no file locking is done, for example.
+#
+# Params:
+#   $msg - text to record
+# Returns: 1 when the line was written, nothing when the log could not be
+#   opened. Never dies: problems are reported with warn only.
+sub logentry {
+    my $msg = shift;
+
+    # Without an open handle there is nothing to print to or close, so
+    # report the problem once and give up on this entry.
+    my $LOG;
+    if ( !open( $LOG, '>>', $LOGFILE ) ) {
+        warn "bad open for append: $! : $LOGFILE\n";
+        return;
+    }
+
+    my $timestamp = `date`;
+    chomp $timestamp;
+
+    # line format: timestamp | script name | process id | message
+    print {$LOG} "$timestamp | $0 | $$ | $msg\n";
+    close($LOG) or warn "bad close: $! : $LOGFILE\n";
+
+    return 1;
+}
+
+# Record a fatal problem in the script log, then end the script.
+#
+# Params:
+#   $msg - description of what went wrong
+# Never returns: always dies with the prefixed message.
+sub fatal_logentry {
+    my ($reason) = @_;
+
+    my $fatal_msg = 'FATAL ERROR SCRIPT ENDED SUDDENLY: ' . $reason;
+    logentry($fatal_msg);
+
+    die $fatal_msg . "\n";
+}
+
=== modified file 'app-mvhub/project-tools/templates/template.conf'
--- app-mvhub/project-tools/templates/template.conf 2010-06-13 22:41:22 +0000
+++ app-mvhub/project-tools/templates/template.conf 2010-06-29 18:43:26 +0000
@@ -114,6 +114,7 @@
setup_db_dir=link-to-live-code/app-mvhub/setup/database/sql/
setup_etc_dir=link-to-live-code/app-mvhub/setup/etc/
user_conf_dir=conf/
+log_dir=BAD_FIX_IN_CONF_FILE
[COOKIES]
# name of cookie used to store
=== modified file 'app-mvhub/setup/etc/cron.d/mvhub-cron'
--- app-mvhub/setup/etc/cron.d/mvhub-cron 2010-05-21 17:31:47 +0000
+++ app-mvhub/setup/etc/cron.d/mvhub-cron 2010-06-29 18:43:26 +0000
@@ -20,5 +20,9 @@
10 0 * * * www-data $BIN_DIR//generate_agency_program_pdf.pl $CONF_DIR/$MVH_CONF_FILE
15 0 * * * www-data $BIN_DIR//generate_agency_program_pdf.pl $CONF_DIR/$NSP_CONF_FILE
+ 20 0 * * * www-data $BIN_DIR/run_external_reports $CONF_DIR/$MVH_CONF_FILE
+ 25 0 * * * www-data $BIN_DIR/run_external_reports $CONF_DIR/$NSP_CONF_FILE
+ 30 0 * * * root /usr/sbin/apache2ctl graceful
+
51 8 1,15 * * www-data $BIN_DIR/notification_email.pl --config=$CONF_DIR/$MVH_CONF_FILE --execute
59 8 1,15 * * www-data $BIN_DIR/notification_email.pl --config=$CONF_DIR/$NSP_CONF_FILE --execute
=== modified file 'app-mvhub/t/debian_packages_installed.t'
--- app-mvhub/t/debian_packages_installed.t 2010-06-23 19:16:22 +0000
+++ app-mvhub/t/debian_packages_installed.t 2010-06-29 18:43:26 +0000
@@ -7,7 +7,10 @@
use Test::More;
my %required_packages = (
+ 'analog' => [],
'apache2-mpm-itk' => ['apache2-mpm-prefork'],
+ 'graphviz' => [],
+ 'jdresolve' => [],
'libapache2-mod-macro' => [],
'libcgi-application-perl' => [],
'libcgi-application-plugins-perl' => [],
@@ -51,6 +54,7 @@
'tetex-extra' => ['texlive-extra-utils'],
'tidy' => [],
'wamerican' => [],
+ 'visitors' => [],
);
{ # main
=== modified file 'lib-mvhub/lib/MVHub/Utils/Setup.pm'
--- lib-mvhub/lib/MVHub/Utils/Setup.pm 2010-06-25 14:51:40 +0000
+++ lib-mvhub/lib/MVHub/Utils/Setup.pm 2010-06-29 18:43:26 +0000
@@ -78,7 +78,10 @@
$template_cfg->param( 'NOTIFICATION.dev_email', "$username\@thecsl.org" );
$template_cfg->param( 'SITE.website_name',
"$site_code.$username.testing123.net" );
- $template_cfg->param( 'SITE.website_code', "$site_code" );
+ $template_cfg->param( 'SITE.website_code', "$site_code" );
+ $template_cfg->param( 'RELATIVE_PATH.log_dir', "log/$site_code/" );
+ $template_cfg->param( 'RELATIVE_PATH.reports_dir',
+ "reports/$site_code/" );
return $template_cfg;
}
@@ -492,12 +495,12 @@
group => '',
permissions => 'u=rwx,g=rwsx,o=rx',
},
- { dir => 'reports',
+ { dir => 'reports/nsp',
owner => '',
group => '',
permissions => 'u=rwx,g=rwsx,o=r-x',
},
- { dir => 'reports',
+ { dir => 'reports/mvh',
owner => '',
group => '',
permissions => 'u=rwx,g=rwsx,o=rx',
=== modified file 'lib-mvhub/t/conf/all.conf'
--- lib-mvhub/t/conf/all.conf 2010-06-16 17:56:56 +0000
+++ lib-mvhub/t/conf/all.conf 2010-06-29 18:43:26 +0000
@@ -96,6 +96,7 @@
setup_db_dir=link-to-live-code/app-mvhub/setup/database/sql/
setup_etc_dir=link-to-live-code/app-mvhub/setup/etc/
user_conf_dir=conf/
+log_dir=log/nsp/
[COOKIES]
# name of cookie used to store
Follow ups