 2008/05/12
|
Last update 1999/02/20
TPJ: Issue_01_HTMLregexps
- finding_links_1
- finding_links_2
- urlify
- extracting_1
- extracting_2
- extracting_3
- extracting_4
- striphtml
- changing_links
- More Samples on HTMLregexps
| Issue_01_HTMLregexps1. finding_links_1
|
Download finding_links_1
|
#!/usr/bin/perl -n -00
|
|
while ( /<\s*A\s+HREF\s*=\s*(["'])(.*?)\1.*?>/gi ) {
|
|
print "$2\n";
|
|
}
|
| Issue_01_HTMLregexps2. finding_links_2
|
Download finding_links_2
|
#!/usr/bin/perl -n00
|
|
while ( m{ # match repeatedly with /g
|
|
< \s* A # this is an anchor
|
|
\s+ HREF # a link spec
|
|
\s* = \s* # here comes the link
|
|
( ["'] ) # either quote, saved in $1
|
|
# and \1
|
|
( .*? ) # the whole link, saved in $2
|
|
\1 # the original $1 quote
|
|
.*? > # the rest of the tag
|
|
}xsgi) # /x for expanded patterns
|
|
# /s so . can match \n
|
|
# /g to get multiple hits
|
|
# in one paragraph
|
|
# /i for case insensitivity
|
|
# on A and HREF
|
|
{
|
|
print "$2\n";
|
|
}
|
| Issue_01_HTMLregexps3. urlify
|
Download urlify
|
#!/usr/bin/perl
|
|
# urlify
|
|
|
|
require 5.002;
|
|
# well, or 5.000 if you strip the comments
|
|
|
|
$urls = '(' . join ('|', qw{
|
|
http
|
|
telnet
|
|
gopher
|
|
file
|
|
wais
|
|
ftp
|
|
}
|
|
)
|
|
. ')';
|
|
|
|
$ltrs = '\w';
|
|
$gunk = '/#~:.?+=&%@!\-';
|
|
$punc = '.:?\-';
|
|
$any = "${ltrs}${gunk}${punc}";
|
|
|
|
while (<>) {
|
|
# use this if early-ish perl5 (pre 5.002)
|
|
# s{\b(${urls}:[$any]+?)(?=[$punc]*[^$any]|\Z)}
|
|
# {<A HREF="$1">$1</A>}goi;
|
|
s{
|
|
\b # start at word boundary
|
|
( # begin $1
|
|
$urls : # need resource and a colon
|
|
[$any] +? # followed by one or more
|
|
# of any valid character, but
|
|
# be conservative and take
|
|
# only what you need to....
|
|
) # end $1
|
|
(?= # a look-ahead,
|
|
# non-consumptive assertion
|
|
[$punc]* # either 0 or more punctuation
|
|
[^$any] # followed by a non-url char
|
|
| # or else
|
|
$ # then end of the string
|
|
)
|
|
}{<A HREF="$1">$1</A>}igox;
|
|
print;
|
|
}
|
| Issue_01_HTMLregexps4. extracting_1
|
Download extracting_1
|
#!/usr/bin/perl -00 -ln
|
|
print $1 if m:<TITLE>(.*)</TITLE>:si;
|
| Issue_01_HTMLregexps5. extracting_2
|
Download extracting_2
|
#!/usr/bin/perl -n
|
|
BEGIN { ($/, $>) = ("", "\n") }
|
|
print $1 if m:<TITLE>(.*)</TITLE>:si;
|
| Issue_01_HTMLregexps6. extracting_3
|
Download extracting_3
|
#!/usr/bin/perl
|
|
use English;
|
|
$RS = '';
|
|
|
|
while ($paragraph = <ARGV>) {
|
|
if ( $paragraph =~ m:<TITLE>(.*)</TITLE>:si ) \
|
|
{
|
|
print "$1\n";
|
|
}
|
|
}
|
| Issue_01_HTMLregexps7. extracting_4
|
Download extracting_4
|
#!/usr/bin/perl -w
|
|
|
|
require 5.002;
|
|
# or 5.001 iff you remove the comments!
|
|
|
|
use strict;
|
|
undef $/;
|
|
@ARGV = ('-') unless @ARGV;
|
|
my($title, $filename);
|
|
while ($filename = shift) {
|
|
unless (open(HTML, $filename)) {
|
|
warn "can't open $filename: $!";
|
|
next;
|
|
}
|
|
my $html = <HTML>;
|
|
my $count = 0;
|
|
while ( $html =~ m{
|
|
< \s* TITLE .*? > # begin tag
|
|
\s* (.*?) \s* # co\
|
|
ntents
|
|
< \s* / \s* TITLE .*? > \
|
|
# end tag
|
|
}gsix ) {
|
|
if ($count++) {
|
|
warn "$filename has $count titles!\n";
|
|
}
|
|
($title = $1 || "<UNTITLED>") =~ s/\s+/ /g;
|
|
write;
|
|
}
|
|
}
|
|
|
|
format STDOUT =
|
|
@<<<<<<<<<<<<<<<<<&\
|
|
lt;<<<<< ^<<<<<<<<<<<&\
|
|
lt;<<<<<<<<<<<<<<<<<\
|
|
;<<<<<<<<
|
|
$filename, $title
|
|
^<<<<<<<<<<<\
|
|
<<<<<<<<<<<<<<<<<&l\
|
|
t;<<<<<<<<
|
|
~~ $title
|
|
.
|
| Issue_01_HTMLregexps8. striphtml
|
Download striphtml
|
#!/usr/bin/perl -p0777
|
|
# striphtml ("striff tummel")
|
|
|
|
# how to strip out html comments and
|
|
# tags and transform entities in just
|
|
# three--count 'em three--
|
|
# substitutions; sed and awk eat your
|
|
# heart out. :-)
|
|
|
|
# as always, translations from this
|
|
# nacri rendition into more
|
|
# characteristically marine,
|
|
# herpetoid, titillative, or
|
|
# indonesian idioms are welcome for
|
|
# the furthering of comparative
|
|
# cyberlinguistic studies.
|
|
|
|
require 5.001;
|
|
# for nifty embedded regexp comments
|
|
|
|
# first we'll shoot all the
|
|
# <!-- comments -->
|
|
|
|
s{ <! # comments begin with `<!'
|
|
# followed by 0 or more
|
|
# comments;
|
|
(.*?) # this eats up comments
|
|
# in non random places
|
|
( # not supposed to have any
|
|
# whitespace here
|
|
# just a quick start:
|
|
-- # each comment starts with
|
|
# a `--'
|
|
.*? # and includes all text up
|
|
# to and including the
|
|
-- # next occurrence.
|
|
\s* # and may have trailing
|
|
# whitespace (but not
|
|
# leading whitespace)
|
|
)+ # repetire ad libitum
|
|
(.*?) # trailing non comment
|
|
text
|
|
|
|
> # up to a `>'
|
|
|
|
}{
|
|
|
|
if ($1 || $3) { # this silliness for
|
|
|
|
# embedded comments in tags
|
|
|
|
"<!$1 $3>";
|
|
|
|
}
|
|
|
|
}gsex; # mutate into nada, nothing,
|
|
|
|
# and niente
|
|
|
|
|
|
|
|
# next we'll remove all the <tags>
|
|
|
|
|
|
s{ < # opening angle bracket
|
|
|
|
#
|
|
|
|
(?: # Non-backreffing grouping
|
|
|
|
# paren
|
|
|
|
[^>'"] * # 0 or more things that are
|
|
|
|
# neither > nor ' nor "
|
|
|
|
| # or else
|
|
|
|
".*?" # a section between
|
|
|
|
# double quotes (stingy match)
|
|
|
|
| # or else
|
|
|
|
'.*?' # a section between
|
|
|
|
# single quotes (stingy match)
|
|
|
|
)+ # repetire ad libitum
|
|
|
|
# hm.... are null tags (<>)
|
|
|
|
# legal?
|
|
|
|
> # closing angle bracket
|
|
|
|
}{}gsx; # mutate into nada, nothing,
|
|
|
|
# and niente
|
|
|
|
|
|
# finally we'll translate all &valid; HTML 2.0
|
|
|
|
# entities
|
|
|
|
|
|
s{ (
|
|
|
|
& # an entity starts with a
|
|
|
|
# semicolon
|
|
|
|
(
|
|
|
|
\x23\d+ # and is either a pound
|
|
|
|
# (# == hex 23) and numbers
|
|
|
|
| # or else
|
|
|
|
\w+ # has alphanumunders...
|
|
|
|
)
|
|
|
|
;? # a semicolon terminates,
|
|
|
|
# as does anything else
|
|
|
|
)
|
|
|
|
} {
|
|
|
|
$entity{$2} # if it's a known entity,
|
|
|
|
# use that.
|
|
|
|
|| # But otherwise
|
|
|
|
$1 # leave what we'd found.
|
|
|
|
}gex; # execute replacement--that's
|
|
|
|
# code not a string
|
|
|
|
|
|
|
|
|
|
|
|
# but wait! load up the %entity mappings
|
|
|
|
# enwrapped in a BEGIN that the last might be
|
|
|
|
# first, and only execute once, since we're in
|
|
|
|
# a -p "loop"; awk is kinda nice after all.
|
|
|
|
|
|
|
|
BEGIN {
|
|
|
|
|
|
%entity = (
|
|
|
|
lt => '<',
|
|
|
|
gt => '>',
|
|
|
|
amp => '&',
|
|
|
|
quot => '"', # vertical double quote
|
|
|
|
nbsp => chr 160, # no-break space
|
|
|
|
iexcl => chr 161, # !
|
|
|
|
cent => chr 162, #
|
|
|
|
pound => chr 163, #
|
|
|
|
curren => chr 164, #
|
|
|
|
yen => chr 165, #
|
|
|
|
brvbar => chr 166, # broken vertical bar
|
|
|
|
sect => chr 167, #
|
|
|
|
uml => chr 168, # (umlaut, or
|
|
|
|
# dieresis)
|
|
|
|
copy => chr 169, #
|
|
|
|
ordf => chr 170, # (feminine ordinal)
|
|
|
|
laquo => chr 171, #
|
|
|
|
not => chr 172, #
|
|
|
|
shy => chr 173, # soft hyphen
|
|
|
|
reg => chr 174, #
|
|
|
|
macr => chr 175, #
|
|
|
|
deg => chr 176, #
|
|
|
|
plusmn => chr 177, #
|
|
|
|
sup2 => chr 178, # superscript two
|
|
|
|
sup3 => chr 179, # superscript three
|
|
|
|
acute => chr 180, # (acute accent)
|
|
|
|
micro => chr 181, # micro sign
|
|
|
|
para => chr 182, # (pilcrow)
|
|
|
|
middot => chr 183, # o
|
|
|
|
cedil => chr 184, # (cedilla)
|
|
|
|
sup1 => chr 185, # superscript one
|
|
|
|
ordm => chr 186, # (masculine ordinal)
|
|
|
|
raquo => chr 187, #
|
|
|
|
frac14 => chr 188, # one-quarter
|
|
|
|
frac12 => chr 189, # one-half
|
|
|
|
frac34 => chr 190, # three-quarters
|
|
|
|
iquest => chr 191, #
|
|
|
|
Agrave => chr 192, # A
|
|
|
|
Aacute => chr 193, # A
|
|
|
|
Acirc => chr 194, # A
|
|
|
|
Atilde => chr 195, # A
|
|
|
|
Auml => chr 196, # A
|
|
|
|
Aring => chr 197, # A
|
|
|
|
AElig => chr 198, #
|
|
|
|
Ccedil => chr 199, # C
|
|
|
|
Egrave => chr 200, # E
|
|
|
|
Eacute => chr 201, # E
|
|
|
|
Ecirc => chr 202, # E
|
|
|
|
Euml => chr 203, # E
|
|
|
|
Igrave => chr 204, # I
|
|
|
|
Iacute => chr 205, # I
|
|
|
|
Icirc => chr 206, # I
|
|
|
|
Iuml => chr 207, # I
|
|
|
|
ETH => chr 208, # capital Eth,
|
|
|
|
# Icelandic
|
|
|
|
Ntilde => chr 209, # N
|
|
|
|
Ograve => chr 210, # O
|
|
|
|
Oacute => chr 211, # O
|
|
|
|
Ocirc => chr 212, # O
|
|
|
|
Otilde => chr 213, # O
|
|
|
|
Ouml => chr 214, # O
|
|
|
|
times => chr 215, #
|
|
|
|
Oslash => chr 216, # O
|
|
|
|
Ugrave => chr 217, # U
|
|
|
|
Uacute => chr 218, # U
|
|
|
|
Ucirc => chr 219, # U
|
|
|
|
Uuml => chr 220, # U
|
|
|
|
Yacute => chr 221, # capital Y, acute
|
|
|
|
# accent
|
|
|
|
THORN => chr 222, # capital THORN,
|
|
|
|
# Icelandic
|
|
|
|
szlig => chr 223, #
|
|
|
|
agrave => chr 224, # a
|
|
|
|
aacute => chr 225, # a
|
|
|
|
acirc => chr 226, # a
|
|
|
|
atilde => chr 227, # a
|
|
|
|
auml => chr 228, # a
|
|
|
|
aring => chr 229, # a
|
|
|
|
aelig => chr 230, #
|
|
|
|
ccedil => chr 231, # c
|
|
|
|
egrave => chr 232, # e
|
|
|
|
eacute => chr 233, # e
|
|
|
|
ecirc => chr 234, # e
|
|
|
|
euml => chr 235, # e
|
|
|
|
igrave => chr 236, # i
|
|
|
|
iacute => chr 237, # i
|
|
|
|
icirc => chr 238, # i
|
|
|
|
iuml => chr 239, # i
|
|
|
|
eth => chr 240, # small eth, Icelandic
|
|
|
|
ntilde => chr 241, # n
|
|
|
|
ograve => chr 242, # o
|
|
|
|
oacute => chr 243, # o
|
|
|
|
ocirc => chr 244, # o
|
|
|
|
otilde => chr 245, # o
|
|
|
|
ouml => chr 246, # o
|
|
|
|
divide => chr 247, #
|
|
|
|
oslash => chr 248, # o
|
|
|
|
ugrave => chr 249, # u
|
|
|
|
uacute => chr 250, # u
|
|
|
|
ucirc => chr 251, # u
|
|
|
|
uuml => chr 252, # u
|
|
|
|
yacute => chr 253, # small y, acute
|
|
|
|
thorn => chr 254, # small thorn,
|
|
|
|
# Icelandic
|
|
|
|
yuml => chr 255, # y
|
|
|
|
);
|
|
|
|
|
|
|
|
# now fill in all the numbers to match
|
|
|
|
# themselves
|
|
|
|
|
|
foreach $chr ( 0 .. 255 ) {
|
|
|
|
$entity{ '#' . $chr } = chr $chr;
|
|
|
|
}
|
|
|
|
}
|
| Issue_01_HTMLregexps9. changing_links
|
Download changing_links
|
|
|
#!/usr/bin/perl -p -i.bak -00
|
|
s[
|
|
(
|
|
< \s* A
|
|
\s+ HREF
|
|
\s* = \s*
|
|
( ["'] )
|
|
)
|
|
http://foo\.com/somewhere/
|
|
(
|
|
( .*? )
|
|
\2
|
|
.*? >
|
|
)
|
|
][${1}http://www.foo.com/elsewhere/$2]xsgi;
|
| Issue_01_HTMLregexps10. More Samples on HTMLregexps
|

Last update 1999/02/20 
All Rights Reserved - (C) 1997 - 2008 by The Labs.Com |