Webcheck Utility

Continuity of web site directories

"check" is a wrapper program for "webcheck" which allows recursive use.
"webcheck" is a Perl script for checking website directories, and is found [below.]

Check

#!/bin/bash
# $Id: check,v 3.2 2001/06/14 05:11:36 jno Exp $
# /usr/local/bin/check - recursive use of webcheck
#
# Wrapper script; it may be invoked from any directory. Starting at the
# current directory, find visits every directory beneath it (including
# "." itself) and runs webcheck once per directory, handing it that
# directory's path as the single argument.
find . -type d -exec webcheck '{}' ';'

Webcheck

#!/usr/bin/perl
# $Id: webcheck,v 3.28 2002/11/30 20:50:54 jno Exp $
# /usr/local/bin/webcheck, Usage: webcheck {directory}
#        The directory argument is required; webcheck exits with an
#        error when it is missing or when the directory cannot be entered.
# DESCRIPTION: A html linting utility written in Perl, which
#        checks all internal anchor, img, and background links,
#        and can be used recursively. See notes below program.

use strict;
use warnings;

########## NOTE: Make a selection of file extension you wish this
##########       program to check for. Include images.
########## NOTE: you must list 'html' before 'htm'
##########       (alternatives are tried left to right, so "htm" listed
##########       first would cut "page.html" short at "page.htm")

my $extensions = "html|htm|txt|jpg|gif|zip|wav";

########## NOTE: add or delete the "*.htm", "*.html", or "*.php" patterns
########## as needed in the line below. Patterns are separated by blanks;
########## a pattern which matches no file is silently ignored.

my $page_patterns = '*.htm *.html *.php';

my $currentdate = `date`;                 # keeps its trailing newline
my $currentdir  = $ARGV[0];

# The directory to check must be named on the command line.
defined $currentdir && length $currentdir
    or die "must specify directory\n";

# Stop here if the directory cannot be entered; carrying on would
# silently check whatever directory we happened to be started from.
chdir $currentdir
    or die "cannot change to directory $currentdir: $!\n";
print "   directory: $currentdir\n";

# A "link-like" word chunk: a run of characters other than parentheses,
# double quote, pipe, equal sign and blank, followed by a period and one
# of the known extensions (case-insensitive).
my $link_re = qr/[^()"|= ]+\.(?:$extensions)/i;

my $missing = '';                         # report text, grouped by page

my @pages = glob $page_patterns;
foreach my $filename (sort @pages)        # 1- each file
{
    print "$filename\n";

    my $fh;
    unless (open $fh, '<', $filename) {
        warn "cannot read $filename: $!\n";
        next;
    }
    my $all = do { local $/; <$fh> };     # all of file slurped up
    close $fh;
    $all = '' unless defined $all;

    my $links = '';                       # missing links of this page

    while ($all =~ /(<[^>]+>)/g)          # 2- each <..> segment inspected
    {
        my $code = $1;

        while ($code =~ /($link_re)/g)    # 3- each link-like chunk
        {
            my $target = $1;

            next if $target =~ m{://};    # skip http files

########## NOTE: Hash out the following line to speed up WebCheck,
##########       and see notes under "Orphans" below.

            system('touch', '--', $target) if -e $target;  # touch if exists

            $links .= " -- $target\n" unless -e $target;   # list if not exist
        }                                 #-3
    }                                     #-2

    $missing .= "$filename\n$links\n" if $links;   # assoc w filename
}                                         #-1

#########       Send email locally (to owner) -- see notes
######### NOTE: if sendmail delays delivery, use procmail instead.
#########       Both forms are shown below.
#########       Or run /usr/sbin/sendmail -q as root

if ($missing) {
    open(my $mail, '|-', '/usr/sbin/sendmail', '-oi', '-n', '-t')
        or die "cannot run /usr/sbin/sendmail: $!\n";
    # open(my $mail, '|-', '/usr/bin/procmail', '-Y')
    #     or die "cannot run /usr/bin/procmail: $!\n";
    print {$mail} <<EOF;
To:$ENV{LOGNAME}
From:WEBCHECK
Subject:$currentdir

 "WebCheck" searches all *.htm and *.html files in the logged directory
 for word chunks ending in the following file extensions...
           $extensions
 The current directory and listed paths of a link are inspected for the 
 existance of files. Fully qualified URLs and name anchors are skipped.
 Today's date.. $currentdate
 This check was made from .. $currentdir

 Missing links are listed by source filename below... 
\n$missing
(end)
EOF

    # Closing the pipe waits for the mailer, so a failed delivery shows up.
    close $mail
        or warn "mail program reported a problem (exit status $?)\n";

    print "====== ERRORS reported via email ======\n\n";
}
else {
    print "    == no email report ==\n\n";
}

#
#                            SETUP:     
#
# - make note of perl and sendmail (or procmail) location, and the 
#   To: header, and make corrections as needed. The "To:" is currently 
#   set to $ENV{LOGNAME}. If email notification is to be sent elsewhere, 
#   change "To:$ENV{LOGNAME}" to another email address. 
#   Be sure to escape the "@" as "\@"
#
# - See notes in the body of the program concerning appropriate use 
#   of sendmail or procmail. 
#
# - Set the file extensions to be looked for at the variable 
#
#                   $extensions="aaa|bbb|ccc";
#
#   be sure (1) the right side is enclosed in quotes followed with ;
#   (2) the extensions are separated with the | sign.
#   The variable $extensions may be found at the top of the program.
#
# - if "*.html" files are not used, delete this from the line of code
#
#              foreach $filename (`ls *.htm *.html`)
#
#   "ls" will write a "file not found" message to the screen if there 
#   are no "html" file extensions, yet this is included in the list.
#   Similarly other forms such as "php" can be added in this line.
#
#                            USAGE:
#
# WebCheck searches all *.htm and *.html files (or other file extensions
# as specified) in the current directory for "word chunks" ending in
# common file extensions included within HTML tags. The current directory,
# and any directory included as part of a filename, are inspected for the 
# existence of file names derived from these "link-like" word chunks. 
#
#
# The user account is notified by email of missing files. A separate
# e-mail will be sent for each directory where missing file names were 
# discovered. The missing files are listed by the name of the htm (or
# html) file where these are called.
# 
# Note that _all_ of the files will be inspected, including orphans.
# Thus if you receive strange messages about some files, suspect that
# they may be files requested as links from abandoned html files. 
# 
# Webcheck can determine orphaned files, that is, files to which no links
# exist, because all inspected files are touched. Orphans thus show up as 
# files with earlier dates ("ls -tl" will list and group by dates). 
#
# Note that all orphans will not be identified unless WebCheck has been
# executed in each subdirectory. See notes on a recursive wrapper, below. 
#
# NOTE: touching files is very time consuming, since the process is
# repeated at every instance a file is encountered. To VOID the 
# ability to identify orphaned files, comment out the line..
#
#                elsif (-e $&) { system (touch, $&)}   
#  
# To have webcheck operate recursively through a file system, execute
# the following (this wrapper file is available as "check")..
#
# 	find . -type d -exec './webcheck' {}   \;
#
# (exactly as it appears above) from some starting point in the directory
# system. This assumes webcheck can be found on the path (as for example,
# in /usr/local/bin) or that a copy of webcheck is found in the root
# directory where file checking is started. 
#
# Webcheck operates verbosely, listing all the filenames which are
# inspected. to operate silently, hash the lines..
# 	print "$currentdir\n" ;
#	print "$filename";
#	print "   === ERRORS reported via email ===\n\n" ;
#	else { print "   === no email report ===\n\n" };
#
# Note that files linked from orphaned files will show as active files
# until the orphans are removed. See "about orphans" above.
#
# WebCheck will catch _any_ word-like link file names (with names of any
# size, including path names), including any nonalphabetical characters 
# except the double quote, equal sign, parenthesis, and included blanks.
#
#                          BUGS AND CAVEATS:
# 
# - WebCheck lists missing links as often as they occur in a html file.
# - All anchors of the form "href=http://... etc" are ignored. 
# - Name anchors links of the form "file.htm#goto" are stripped of 
#   the information after the # mark before testing. 
#
#                          COPYRIGHT NOTICES:
#
# Copyright (C) 1998 2001 Kees Cook, Counterpoint Networking, Inc.
#        cook (at) outflux (dot) net
# Developmental design: Jno Cook, Aesthetic Investigation, Chicago
#             jno (at) blight (dot) com  
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation, version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
# http://www.gnu.org/copyleft/gpl.html
#

Website Provider: Outflux.net, www.Outflux.net
URL:http://jnocook.net/geek/webcheck.htm