|
#!/usr/local/bin/perl
|
|
# File: nph-rater.cgi
|
|
# Copyright 1997, Lincoln D. Stein. All rights reserved.
|
|
# Permission is granted to use, modify and redistribute in whole or in\
|
|
part,
|
|
# provided that the above copyright statement remains prominently disp\
|
|
layed.
|
|
|
|
use LWP::UserAgent;
|
|
use HTML::Parse;
|
|
use HTTP::Status;
|
|
use CGI qw/:standard :html3 :nph/;
|
|
use CGI::Carp;
|
|
|
|
$MAX_DEPTH=2; # how deeply to recurse
|
|
|
|
# here are words that get counted towards the cool! index
|
|
@COOL_WORDS = qw/cool hot groovy neat wild snazzy great awesome wicked\
|
|
/;
|
|
# here are URL components that may indicate an advertising banner
|
|
@AD_WORDS = qw/promotion ad advertisement sponsor banner commerci\
|
|
al
|
|
promotions ads advertisements sponsors banners commercials
|
|
doubleclick/;
|
|
|
|
# here are the attributes to count towards tutie-frutie
|
|
@COLOR_ATTR = qw/color bgcolor text link alink vlink background/;
|
|
# here is the number of previous rankings to list
|
|
$PREVIOUS_RANKS = 20;
|
|
|
|
# here is the name of the file that holds the previous rankings
|
|
$RANK_FILE = '/usr/local/etc/www/INDEXER.RANKS';
|
|
#---------------------------------------------------------------------\
|
|
--------
|
|
# no user serviceable parts below
|
|
|
|
# global for collecting statistics
|
|
%COUNTS = (
|
|
'pages' => 0,
|
|
'images' => 0,
|
|
'doodads' => 0,
|
|
'colors' => 0,
|
|
'frames' => 0,
|
|
'ads' => 0,
|
|
'link_words' => 0,
|
|
'cool_words' => 0,
|
|
'total_words' => 0,
|
|
);
|
|
grep ($COLOR_ATTR{$_}++,@COLOR_ATTR);
|
|
$LEVEL = 0; # recursion level
|
|
$HTML::Parse::IGNORE_UNKNOWN = 0; # don't ignore unknown tags
|
|
$COOL_PATTERN = join("|",@COOL_WORDS);
|
|
$AD_PATTERN = join("|",@AD_WORDS);
|
|
$SIG{ALRM} = \&do_alarm;
|
|
$FH = 'FH0000'; # just a filehandle
|
|
$|=1;
|
|
print header,
|
|
start_html('The Rating Game'),
|
|
h1('The Rating Game');
|
|
if (param('action') eq 'explain') {
|
|
print 'The idea is to automatically collect information about a li\
|
|
nked set of pages ',
|
|
'that gives the reader some idea of the flavor of the document. T\
|
|
he ratings ',
|
|
'measure pages\' information content, the amount of graphics they \
|
|
use, ',
|
|
'the presence of applets, and the presence of commercial content.'\
|
|
,
|
|
p(),
|
|
h2('Key'),
|
|
dl(
|
|
dt(strong('Information Index (II)')),
|
|
dd('Basic measure of the word to link ratio, defined as:',p(),
|
|
pre('II = 100 x (1 - (words inside links / total words in do\
|
|
cument))'),
|
|
p()),
|
|
dt(strong('Graphics Index (GI)')),
|
|
dd('Measure of the graphics usage of a page, defined as:',p(),
|
|
pre('GI = number IMG tags / number pages'),
|
|
p()),
|
|
dt(strong('Gadgets Index (GI)')),
|
|
dd('Measure of the number of applets, controls and scripts, def\
|
|
ined as:',p(),
|
|
pre('GI = number of gadgets / number pages'),
|
|
p()),
|
|
dt(strong('TutieFrutie Index (TFI)')),
|
|
dd('Measure of how "colorful" a document is, defined as:',p(),
|
|
pre('TFI = number of color changes / number of pages'),
|
|
p()),
|
|
dt(strong('Frames Index (FI)')),
|
|
dd('Measure of the use of frames, defined as:',p(),
|
|
pre('FI = number of frame tags'),
|
|
p()),
|
|
dt(strong('Cool! Index (C!I)')),
|
|
dd('Measure of how excited a page is about itself, defined as:'\
|
|
,p(),
|
|
pre('C!I = 100 x ( exclamation marks + superlatives ) / tota\
|
|
l sentences'),
|
|
p()),
|
|
dt(strong('Crass Commercialism Index (CCI)')),
|
|
dd('Indication of banner advertising on the page, defined as:',\
|
|
p(),
|
|
pre('CCI = number of ads / number of pages'),
|
|
p(),
|
|
'This program uses heuristics to count banner advertisements\
|
|
and may ',
|
|
'not always guess correctly.'
|
|
)
|
|
);
|
|
} else {
|
|
print
|
|
'This CGI script was written to go along with my May 1997 ',
|
|
a({-href=>'http://www.webtechniques.com/'},'WebTechniques')\
|
|
,' column ',
|
|
cite('Sifting the Wheat from the Chaff'),'. It demonstrations\
|
|
a way of ',
|
|
'rating Web pages automatically for information content. To u\
|
|
se it, enter a full ',
|
|
'URL in the text field below and press ',strong('Rate'),
|
|
'. After some processing, the ',
|
|
'script will report a variety of rating indexes.',
|
|
p(),
|
|
'This script isn\'t fast, so be patient. In order minimize sy\
|
|
stem load, ',
|
|
'the script currently only descends one level of links.',
|
|
p(),
|
|
a({-href=>script_name() . '?action=explain',-target=>'ex\
|
|
planation'},'Explain the ratings.');
|
|
|
|
print_prompt();
|
|
%stats = process_url($URL) if $URL = param('url_to_process');
|
|
print_previous(%stats);
|
|
}
|
|
print_tail();
|
|
exit 0;
|
|
sub print_prompt {
|
|
print hr,
|
|
start_form,
|
|
'URL to Rate:',br,
|
|
textfield(-name=>'url_to_process',-size=>60),br,
|
|
submit('Rate'),
|
|
end_form;
|
|
}
|
|
|
|
sub process_url {
|
|
my $url = shift;
|
|
print hr(),
|
|
h2('Progress');
|
|
print "<PRE>\n";
|
|
collect_stats(new URI::URL $url);
|
|
print "</PRE>\n";
|
|
|
|
return summary_statistics($url) if $COUNTS{'pages'};
|
|
}
|
|
sub print_tail {
|
|
print hr(),
|
|
address(a({-href=>'/~lstein'},"Lincoln D. Stein"),br,
|
|
a({-href=>'http://www.genome.wi.mit.edu/'},'Whitehead I\
|
|
nstitute/MIT Center for Genome Research'));
|
|
}
|
|
sub summary_statistics {
|
|
my $href = shift;
|
|
print h2('Raw Data'),
|
|
table({-border=>''},
|
|
TR({-align=>LEFT},
|
|
th('Pages'), td($COUNTS{'pages'}),
|
|
th('Total Words'), td($COUNTS{'total_words'})),
|
|
TR({-align=>LEFT},
|
|
th('Total Sentences'),td($COUNTS{'sentences'}),
|
|
th('Words in links'),td($COUNTS{'link_words'})),
|
|
TR({-align=>LEFT},
|
|
th('Applets/Controls'), td($COUNTS{'doodads'}),
|
|
th('Cool! Words'), td($COUNTS{'cool_words'})),
|
|
TR({-align=>LEFT},
|
|
th('Graphics'), td($COUNTS{'images'}),
|
|
th('Custom colors'), td($COUNTS{'colors'})),
|
|
TR({-align=>LEFT},
|
|
th('Possible Advertisements'), td($COUNTS{'ads'}),
|
|
th('Frames'), td($COUNTS{'frames'}))
|
|
);
|
|
my %i = (compute_indices(%COUNTS),'href'=>$href);
|
|
print h2('Ratings'),summary_table(\%i);
|
|
return %i;
|
|
}
|
|
|
|
sub summary_table {
|
|
my (@row) = @_;
|
|
my (@rows,$i);
|
|
foreach $i (@row) {
|
|
push(@rows,
|
|
td([a({-href=>$i->{href}},$i->{href}),
|
|
sprintf("%2.1f",$i->{II}),
|
|
sprintf("%2.1f",$i->{GI}),
|
|
sprintf("%2.1f",$i->{DI}),
|
|
sprintf("%2.1f",$i->{TFI}),
|
|
$i->{FI},
|
|
sprintf("%2.1f",$i->{'C!I'}),
|
|
sprintf("%2.1f",$i->{CCI})]
|
|
)
|
|
);
|
|
}
|
|
return join("\n",
|
|
table( {-border=>''},
|
|
TR(th(),
|
|
th('Information'),
|
|
th('Graphics'),
|
|
th('Gadgets'),
|
|
th('Colors'),
|
|
th('Frames'),
|
|
th('Cool!'),
|
|
th('Ads')),
|
|
TR({-align=>RIGHT},\@rows)
|
|
)
|
|
);
|
|
}
|
|
sub print_previous {
|
|
my (%current) = @_;
|
|
my $fh = open_and_lock($RANK_FILE);
|
|
my (@previous_ranks);
|
|
chomp(@previous_ranks = <$fh>);
|
|
if (@previous_ranks) {
|
|
my (@processed) = map { {split("\t")} } @previous_ranks;
|
|
print hr(),h2('Recent Ratings'),summary_table(@processed);
|
|
}
|
|
unless ($COUNTS{'pages'}) {
|
|
unlock($fh);
|
|
return;
|
|
}
|
|
|
|
unshift(@previous_ranks,join("\t",%current));
|
|
pop(@previous_ranks) if @previous_ranks > $PREVIOUS_RANKS;
|
|
seek($fh,0,0);
|
|
print $fh join("\n",@previous_ranks),"\n";
|
|
truncate($fh,tell($fh));
|
|
unlock($fh);
|
|
}
|
|
|
|
sub compute_indices {
|
|
my (%COUNTS) = @_;
|
|
my %indices = (
|
|
II => 100 * (1 - $COUNTS{'link_words'}/($COUNT\
|
|
S{'total_words'} || 1)),
|
|
GI => $COUNTS{'images'}/$COUNTS{'pages'},
|
|
DI => $COUNTS{'doodads'}/$COUNTS{'pages'},
|
|
TFI => $COUNTS{'colors'}/$COUNTS{'pages'},
|
|
FI => $COUNTS{'frames'},
|
|
'C!I'=> 100 * ($COUNTS{'cool_words'}/($COUNTS{'s\
|
|
entences'} || 1)),
|
|
CCI => $COUNTS{'ads'}/$COUNTS{'pages'},
|
|
);
|
|
return %indices;
|
|
}
|
|
|
|
sub collect_stats {
|
|
local $CURRENT_DOC = shift;
|
|
return undef unless $LEVEL < $MAX_DEPTH;
|
|
|
|
my $path = $CURRENT_DOC->abs->path;
|
|
return undef if $BEEN_THERE{$path}++;
|
|
my $href = $CURRENT_DOC->abs->as_string;
|
|
print ' 'x($LEVEL*3),"Examining ",a({-href=>$href},$href)," ";
|
|
$LEVEL++;
|
|
my $agent = new LWP::UserAgent;
|
|
my $request = new HTTP::Request('GET',$CURRENT_DOC);
|
|
my $response = $agent->request($request);
|
|
|
|
local ($BASE,$INSIDE_A_LINK,$TEXT);
|
|
|
|
TRY:
|
|
{
|
|
# replace with a more informative error message later
|
|
do { print em("unable to fetch document\n"); last TRY } unless $re\
|
|
sponse->is_success;
|
|
# This guarantees that we get the correct base document even if th\
|
|
ere was a
|
|
# redirect thrown in there.
|
|
if ($response->request->url->abs->path ne $path) {
|
|
$CURRENT_DOC = $response->request->url;
|
|
last TRY if $BEEN_THERE{$CURRENT_DOC->abs->path}++;
|
|
}
|
|
|
|
# make sure that it's an HTML document!
|
|
my $type = $response->header('Content-type');
|
|
do { print em("not an HTML file\n"); last TRY; } unless $type eq '\
|
|
text/html';
|
|
my $parse_tree = parse($response->content);
|
|
do { print em("unable to parse HTML\n"); last TRY; } unless $parse\
|
|
_tree;
|
|
|
|
print "\n";
|
|
|
|
$COUNTS{'pages'}++;
|
|
$parse_tree->traverse(\&process);
|
|
# for non-obvious reasons, we have to collect all the text before
|
|
# we can count the sentences.
|
|
$COUNTS{'sentences'} += sentences($TEXT);
|
|
$parse_tree->delete;
|
|
}
|
|
$LEVEL--;
|
|
return 1;
|
|
}
|
|
sub process {
|
|
my ($node,$start,$depth) = @_;
|
|
if (ref($node)) { # we have subparts
|
|
$BASE = $node->attr('href')
|
|
if $node->tag eq 'base';
|
|
|
|
$COUNTS{'images'}++ if $start && $node->tag eq 'img';
|
|
$COUNTS{'doodads'}++ if $start && $node->tag =~ /^(applet|\
|
|
object|script)/;
|
|
#
|
|
# count the number of color changes
|
|
grep($COLOR_ATTR{$_} && $COUNTS{'colors'}++,keys %{$node}) if \
|
|
$start;
|
|
|
|
$COUNTS{'frames'}++ if $start && $node->tag eq 'frame';
|
|
$COUNTS{'ads'}++ if $start && $node->tag eq 'img' && \
|
|
is_ad($node->attr('src'));
|
|
# here's where we handle links, and possible recursion
|
|
if ($node->tag eq 'a') {
|
|
my $href = $node->attr('href');
|
|
if ($href) {
|
|
if (is_child_url($href)) {
|
|
my $newdoc = new URI::URL($href,$BASE || $CURRENT_\
|
|
DOC->abs);
|
|
collect_stats($newdoc) unless $start;
|
|
}
|
|
$INSIDE_A_LINK = $start;
|
|
}
|
|
}
|
|
# step into frames correctly
|
|
if ( $start && ($node->tag eq 'frame') ) {
|
|
my $href = $node->attr('src');
|
|
if ($href && is_child_url($href)) {
|
|
my $newdoc = new URI::URL($href,$BASE || $CURRENT_DOC-\
|
|
>abs);
|
|
collect_stats($newdoc);
|
|
}
|
|
}
|
|
|
|
} else { # if we get here we've got plain t\
|
|
ext to deal with
|
|
my @words = $node=~/(\S+)/g;
|
|
$COUNTS{'link_words'} += @words if $INSIDE_A_LINK;
|
|
$COUNTS{'total_words'} += @words;
|
|
$COUNTS{'cool_words'} += is_cool($node);
|
|
$TEXT .= $node . " ";
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
sub is_cool {
|
|
my $text = shift;
|
|
my ($exclamation_marks) = $text=~tr/!/!/;
|
|
my (@cool_words) = $text=~/\b($COOL_PATTERN)\b/oig;
|
|
return $exclamation_marks + @cool_words;
|
|
}
|
|
|
|
sub sentences {
|
|
my $text = shift;
|
|
# count number of capital letters followed some non-punctuation, f\
|
|
ollowed by
|
|
# punctuation and a space.
|
|
my (@sentences) = $text=~/([A-Z].+?[.!?]\s)/gm;
|
|
return scalar(@sentences);
|
|
}
|
|
|
|
sub is_ad {
|
|
my $url = shift;
|
|
return undef unless $url;
|
|
return $url=~/\b($AD_PATTERN)\b/oi;
|
|
}
|
|
|
|
sub is_child_url {
|
|
my $url = shift;
|
|
return undef if $url=~/^\w+:/;
|
|
return undef if $url=~m!^/!;
|
|
return undef if $url=~/^\.\./;
|
|
1;
|
|
}
|
|
|
|
# because bad HTML can cause the LWP parser to hang indefinitely,
|
|
# we must put it in an eval() statement and cause a timeout to
|
|
# occur.
|
|
sub parse {
|
|
my $content = shift;
|
|
return eval <<'END';
|
|
alarm(10);
|
|
my $f=parse_html($content);
|
|
alarm(0);
|
|
$f;
|
|
END
|
|
}
|
|
sub do_alarm {
|
|
die cite("WARNING: parse_html timed out while processing ",$CURREN\
|
|
T_DOC->abs->as_string),"\n";
|
|
}
|
|
# ------------------- file locking code ------------
|
|
# This bit of code creates an advisory lock on the indicated file and
|
|
# returns a file handle to it.
|
|
sub LOCK_SH { 1 }
|
|
sub LOCK_EX { 2 }
|
|
sub LOCK_NB { 4 }
|
|
sub LOCK_UN { 8 }
|
|
sub open_and_lock {
|
|
my $path = shift;
|
|
my $fh;
|
|
local($msg)='';
|
|
local $oldsig = $SIG{'ALRM'};
|
|
$SIG{'ALRM'} = sub { $msg='timed out'; $SIG{ALRM}=$oldsig; };
|
|
alarm(5);
|
|
|
|
$fh = ++$FH;
|
|
open ($fh,"+<$path") or die("Couldn't open $path: $!");
|
|
# now try to lock it
|
|
die("Couldn't get write lock (" . ($msg || "$!") . ")")
|
|
unless flock ($fh,LOCK_EX);
|
|
$fh;
|
|
}
|
|
|
|
sub unlock {
|
|
my $fh = shift;
|
|
flock($fh,LOCK_UN);
|
|
close $fh;
|
|
}
|