Xref: utzoo news.admin:15219 news.software.b:8250 Path: utzoo!utgpu!news-server.csri.toronto.edu!smoke.cs.toronto.edu!moraes Newsgroups: news.admin,news.software.b From: moraes@cs.toronto.edu (Mark Moraes) Subject: Re: Yet Another Cnews Thread (and, another solution). Message-ID: <91Jun13.173712edt.1088@smoke.cs.toronto.edu> Keywords: source, gawk, script Organization: Department of Computer Science, University of Toronto References: <517_=9&@uzi-9mm.fulcrum.bt.co.uk> <1991Jun12.001917.239@gacvx2.gac.edu> <1991Jun13.065630.1783@panix.uucp> <91Jun13.143704edt.1124@smoke.cs.toronto.edu> Date: 13 Jun 91 21:37:45 GMT Lines: 255 moraes@cs.toronto.edu (Mark Moraes) writes: >Caveat: It wasn't really meant for distribution And it shows :-( Some of the messages dated to when we were the only site running the current relaynews code; Paul Eggert pointed out that whitespace is not even technically legal according to 1036. The default now complains about bad dates too. Here's a new version. It's also available by anonymous ftp from ftp.cs.toronto.edu in pub/news (not to be confused with pub/c-news). My earlier warnings stand; included here in case you missed the last version (now cancelled). > This script (requires GNU awk) was derived from the mail messages > Geoff and I sent out to sites during the alpha test of the new fascist > relaynews on news-server.csri.toronto.edu. > > By default, the script complains about whitespace in messageids and > non-header lines in headers. The -q option merely generates the > summary. The -v option complains about all errors the script knows > about. The -o option allows you to specify categories of problem that > it should warn about. eg. newscomplain -o d $NEWSCTL/log should > complain about unparsable dates only. > > I would STRONGLY suggest that you NOT run this script automatically > (from cron or at). Please run it by hand, go through its output > carefully, edit the output or the script suitably to correct errors, > and then send the output to sh -x. Note: It expects Berkmail or > something that understands the -s option. "grep '^:.'" or grep > '^Mail' on the output is useful. > > Caveat: It wasn't really meant for distribution, so I'd advise extreme > care when using it. I'm only posting it because of the continuous > bitching and moaning by people who think they ought to be notified > because their precious, non-compliant trash, er, news posting is being > dropped on the floor by news software that's trying to be careful and > robust. Gah! Mark. #!/bin/sh # To unbundle, sh this file echo newscomplain 1>&2 sed 's/^-//' >newscomplain <<'!' -#! /bin/sh -# Scans C News relaynews log and generates a shell script that sends -# mail to admins of sites responsible for bad messages. thishost=csri.toronto.edu args="wnd"; case "$1" in --q) args=""; shift;; --v) args="dnfmxw"; shift;; --o) shift; args="$1"; shift;; esac -# gawk needed because of nawk hard limits on strings. Could be -# worked around if really necessary. gawk 'BEGIN { - args="'"$args"'"; - skipud = index(args, "d") # unparsable date - skipnh = index(args, "n"); # non-header line in header - skipdf = index(args, "f"); # date in future - skipnm = index(args, "m"); # no message-id - skipxh = index(args, "x"); # missing header - skipws = index(args, "w"); # whitespace in messageid - # to add: illegal messageid -} function saveit(msgid, line, arr) { - nf = split(msgid, v, "@"); - if (nf != 2) { - printf ("weird message-id: %s\n", msgid); - return; - } - # chop off trailing ">" - len = length(v[2]); - if (substr(v[2], len, 1) != ">") { - printf ("weird message-id: %s\n", msgid); - return; - } - machine = substr(v[2], 1, len - 1); - arr[machine] = arr[machine] line "\n"; -} function extract(s, startchar, endchar, tmp1, tmp2, tmp) { - # s, startchar, endchar are arguments, rest are locals. - # This returns whatever is in s between startchar and endchar (not - # inclusive). - tmp1 = index(s, startchar); - if (tmp1 != 0) { - tmp = substr(s, tmp1 + 1, length(s) - tmp1); - tmp2 = index(tmp, endchar); - if (tmp2 != 0) - tmp = substr(tmp, 1, tmp2 - 1); - } else - tmp = ""; - return tmp; -} function complain(s, msg, arr) { - for (m in arr) { - printf "Mail -s \"%s\" usenet@%s << \"EOF\"\nHi\n\n", s, m; - printf "%s\nThe following is from our news log:\n\n%s", msg,\ - arr[m]; - print "EOF\n:"; - } -} -$5 != "-" { - next; -} -/duplicate$/ { - next; -} -/all groups .* excluded in active/ { - next; -} -/older than [0-9]* days/ { - if (days == "") { - # The for() loop is not strictly necessary, but it makes this - # robust in the face of modified logs, eg. nntplink -X mods - for (i = 6; i <= NF; i++) { - if ($i == "older") { - i += 2; - days = $i; - } - } - } - old++; - next; -} -/unparsable Date:/ { - unparsable++; - saveit($6, $0, ud); - next; -} -/contains non-header line/ { - nonheader++; - saveit($6, $0, nh); - next; -} -/Date: in the future:/ { - future++; - saveit($6, $0, df); - next; -} -/no Message-ID: header/ { - nomsg++; - saveit("", $0, nm); - next; -} -/no .*: header/ { - xheader++; - saveit($6, $0, xh); - next; -} -/unapproved article in moderated group/ { - unapproved++; - next; -} -/whitespace in Message-ID/ { - whitespace++; - s = "") ">"; - saveit(s, $0, ws); - next; -} -{ - unknown++; - xx = xx $0 "\n"; -} END { - print "cat << \"EOF\"\nSummary:\n" - printf "%5d : older than %s days\n", old, days; - printf "%5d : unparsable Date\n", unparsable; - printf "%5d : header contains non-header lines\n", nonheader; - printf "%5d : Date: in the future\n", future; - printf "%5d : No Message-ID header\n", nomsg; - printf "%5d : missing required header\n", xheader; - printf "%5d : unapproved article in moderated group\n", unapproved; - printf "%5d : whitespace in Message-ID\n", whitespace; - if (unknown > 0) - printf "%5d : unknown lines:\n%s", unknown, xx; - print "EOF" - if (skipud && unparsable > 0) { - s="unparsable dates in news articles"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system seems to be generating articles with Date:\\n\ headers in violation of Internet RFC 1036, the Usenet article format\\n\ standard. C News sites will not file or forward such articles.\\n\ The correct date format is\\n\ - [Day,] dd Month [yy]yy hh:mm:ss timezone\\n\ -Four digit years and numeric timezones are recommended, per RFC1123.\\n\ -"'"; - complain(s, msg, ud); - } - if (skipnh && nonheader > 0) { - s="non-header lines in news article headers"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system seems to be generating articles with\\n\ non-header lines in headers. A non-header line is one which doesn't\\n\ conform to Internet RFC 1036, the Usenet article format\\n\ standard. C News sites will not file or forward such articles.\\n\ The definition of a header is\\n\ - word: text\\n\ where text may carry onto optional continuation lines. Continuation\\n\ lines must start with whitespace. Headers continue till the first\\n\ empty line. A common mistake is to leave empty headers in -- since\\n\ these have no space after the :, they are illegal.\\n\ -"'"; - complain(s, msg, nh); - } - if (skipdf && future > 0) { - s="bad dates in news articles"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system is generating Date: headers with times that\\n\ appear to be in the future, probably due to a missing or incorrect time\\n\ zone. C News sites will not file or forward such articles.\\n\ -"'"; - complain(s, msg, df); - } - if (skipnm && nomsg > 0) { - s="news articles without message ids"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system is generating articles without Message-IDs\\n\ in violation of Internet RFC 1036, the Usenet article format\\n\ standard. C News sites will not file or forward such articles.\\n\ -"'"; - complain(s, msg, nm); - } - if (skipxh && xheader > 0) { - s="missing required headers in news articles"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system is generating articles without required headers,\\n\ in violation of Internet RFC 1036, the Usenet article format standard.\\n\ C News sites will not file or forward such articles.\\n\ -"'"; - complain(s, msg, xh); - } - if (skipws && whitespace > 0) { - s="whitespace in news article message IDs"; - printf ":\n:%s\n:\n", s; - msg="'"\ Your machine's news system is generating articles with Message-IDs\\n\ that contain whitespace. This is illegal according to section 2.1.5\\n\ of Internet RFC 1036, the Usenet article format standard.\\n\ C News sites will not file or forward such articles.\\n\ -"'"; - complain(s, msg, ws); - } -}' $@ ! chmod +x newscomplain echo shar unpacked fully exit