Path: utzoo!utgpu!news-server.csri.toronto.edu!rpi!crdgw1!uunet!munnari.oz.au!mel.dit.csiro.au!yarra!melba.bby.oz.au!gnb From: gnb@bby.oz.au (Gregory N. Bond) Newsgroups: comp.lang.perl Subject: ngmatch in perl Message-ID: Date: 24 Apr 91 00:47:29 GMT Sender: usenet@melba.bby.oz.au (news READER id) Organization: Burdett, Buckeridge and Young Ltd. Lines: 148 Nntp-Posting-Host: leo-gw This is a little script I hacked up to rebuild our news system after some nasty failures. It is basically a perl transcription if the ngmatch.c file from Cnews. Strange thing is, it is as slow as hell, about 2 match/sec on 20Mips sparc. Doesn't worry me too much (as its a bit of a oncer), but surprising. Another thing that I found difficult was the equivalent of "fgrep -v -f ", where file is large (say, 1,000 lines). I tried a perl script that built a program then eval'd it, but with a 30k pattern file, the perl program grew to >5Mb and ran like a dog. It was much much faster to split the pat file and use a chain of fgreps. Again, surprising. Any suggestions? Greg. #! /usr/local/bin/perl # take a history file on stdin, and match them to the following # newsgroups to see if we would collect it (ignore distributions here) # Each line that matches is passed to the stdout $sysfile_pattern = " gnu,comp,news,sci,rec,misc,soc,talk,alt,to.melba,aus,melb,world,general,\ !talk.all,\ !soc.all,soc.culture.australian,soc.culture.jewish,\ soc.culture.arabic,\ soc.culture.esperanto,soc.religion.islam,\ soc.history,soc.men,soc.motss,\ soc.women,soc.feminism,\ !rec.all,rec.humor,rec.games.hack,rec.sport.cricket,rec.pets,\ rec.games.trivia,rec.arts.movies.reviews,\ !alt.all,alt.humor.oracle,alt.sources,alt.security,alt.sys.sun,\ alt.folklore,alt.religion,\ !sci.all,sci.space,sci.military,sci.med.aids,sci.econ,\ sci.environment,\ sci.energy,sci.skeptic,\ comp.all,!comp.binaries,!comp.all.mac,!comp.all.amiga,\ !comp.all.atari,\ !comp.mail.maps,!comp.os,!comp.sys,comp.sys.sun,comp.sys.ibm,\ !talk.all,all.misc:F: "; #$sysfile_pattern = "comp,!comp.binaries.all"; $sysfile_pattern =~ s/\\.//g; $sysfile_pattern =~ s/\s//g; $sysfile_pattern =~ s/:.*//; $sysfile_pattern =~ tr/A-Z/a-z/; @patterns = split(/,/, $sysfile_pattern); # ngmatch(grouplist) # returns true is one of the groups in grouplist matched one of the # patterns in patterns. sub ngmatch { local ($grouplist) = @_; return scalar(grep(&onegroupmatch($_), split(/,/, $grouplist))); } # # match the single group name in grp to the patterns. Keep track of # depth! (see cnews file libcnews/ngmatch.c for comments) # sub onegroupmatch { local ($grp) = @_; local ($pat, $neg, $depth, $hitdeepest, $faildeepest); foreach $_ (@patterns) { $neg = (($pat = $_) =~ s/^!//); if (($depth = &onepatmatch($grp, $pat)) > 0) { $faildeepest = $depth if $depth > $faildeepest && $neg; $hitdeepest = $depth if $depth > $hitdeepest && !$neg; } } $hitdeepest > $faildeepest; } # # match the single newsgroup against the single pattern # sub onepatmatch { local($grp, $pat) = @_; local (@gs, @ps); local($p, $g, $incr, $depth); @gs = split(/\./, $grp); @ps = split(/\./, $pat); $depth = 0; for ($p = shift(@ps), $g = shift(@gs); $p && $g; $p = shift(@ps), $g = shift(@gs)) { $incr = 20; return 0 if ($p ne $g && $p ne "all"); if ($p eq $g) { $depth += $incr; } else { # Is a wildcard match $depth += --$incr; } } ($p && !$g) ? 0 : $depth; } if (0) { $v = "comp.graphics"; $m = &ngmatch($v); print "match $v: $m\n"; $v = "comp.graphics"; $m = &ngmatch($v); print "match $v: $m\n"; } # # # Now the main loop # # if (1) { open (GOOD, ">good.out") || die "can't open good.out: $!\n"; open (BAD, ">bad.out") || die "can't open bad.out: $!\n"; # open (EXPIRED, ">expired.out") || die "can't open expired.out: $!\n"; while (<>) { @L = split(/\t/); next unless ( ($ng = @L[2]) =~ s=/\d+==g ); $ng =~ s/\s+/,/g; $ng =~ s/,+$//; if (&ngmatch($ng)) { print GOOD; } else { print BAD; } ## last if $num++ > 100; } } -- Gregory Bond, Burdett Buckeridge & Young Ltd, Melbourne, Australia Internet: gnb@melba.bby.oz.au non-MX: gnb%melba.bby.oz@uunet.uu.net Uucp: {uunet,pyramid,ubc-cs,ukc,mcvax,prlb2,nttlab...}!munnari!melba.bby.oz!gnb