Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!swrinde!zaphod.mps.ohio-state.edu!wuarchive!emory!att!linac!midway!gargoyle!chinet!les From: les@chinet.chi.il.us (Leslie Mikesell) Newsgroups: comp.lang.perl Subject: Re: Needed: a pointer for a perl compare script (long, sorry..) Message-ID: <1990Dec04.230436.8432@chinet.chi.il.us> Date: 4 Dec 90 23:04:36 GMT References: <12020@milton.u.washington.edu> Organization: Chinet - Public Access UNIX Lines: 320 In article <12020@milton.u.washington.edu> djo7613@hardy.u.washington.edu.acs.washington.edu (Dick O'Connor) writes: >I've been following this group for awhile, cutting and saving sample scripts >like my Mom clips recipes, but I still haven't picked up enough pointers to >migrate one of my old Fortran "compare" programs to perl. If I could do >this, the users could make their own runs and I'd be free to contemplate a >higher order of existence (programming! :) >My program reads two files of differing format which are sorted by a unique >5-character label. When two labels match, a new record is written, with >info from file A (moved around a bit) written to the "left" and info from >file B (again, reformatted a little) written to the "right". Where a >given record from file A or B has no counterpart, the same new record is >written, with blanks on the "side" without counterpart information. Perl is the language of choice for this kind of thing but it may still turn out to be non-trivial. It is also fairly hard to describe so examples are generally needed. The merging subroutine is > half the file so I'll just include the whole thing. The concept here is to store old and new items into different associative arrays, sort the keys, then make the comparison from the top of each list. Here is a sample that takes a stream that looks like this from a legislative database: NOXIOUS WEEDS, PEST ERADICATION - 1.2.5 VT H 2 AUTHOR: ... TOPIC: ... SUBTOPIC: ... SUMMARY: ........ ........ STATUS: ....... ....... VT H 5 AUTHOR: etc... and files items under /dir/state/number, where state is taken from the first 2 characters of the bill id, and number is last portion of the topic line. Within the file, items are sorted by their bill id with an additional header added to note the date and whether the bill has been signed. A subsequent entry (possibly an update) with the same bill id will be merged by extracting the SUMMARY: portion of the old entry and stuffing it into the new data which will contain the current status. The merging portion is done in the writestate subroutine. Les Mikesell les@chinet.chi.il.us #---------------- # merge.pl # put legislative info into files: # 1 directory per state, 1 file per topic # merge w/current - if new includes summary, use it, # else snarf summary from old # collect one state from current input - then read current info & merge # # top dir of tree: $dir = './test'; open (ERR,">>errlog") ; # %nitems=(); $haveitem = 0; $havetopic = ""; $havestate = ""; $havesum = 0; $instatus = 0; ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time); $mon += 1; $signed = 'n'; while (<>) { # read input file(s) # strip page breaks and some other junk if (/ Page/) { next;} if ( //) { next;} # these are always blank lines if ( /^ *\f/) { next;} # form feeds - ignore if ( /^ *$/) { $bl=1; next; # blank lines } unless (/\n$/) { # possible junk at EOF if ($instatus) { # past a complete item do saveitem(); do writestate(); } else { # incomplete item, junk it $haveitem = 0; $havestate = ""; $instatus = 0; $havesum = 0; } next; } # find: AGRICULTURAL LOANS - 1.4.1 if ( /^[A-Z].* \- [0-9][0-9]*\.[0-9][0-9]*/) { # new topic $topic=$_; # print "$topic \n"; $topic =~ s/.* \- // ; # print "$topic \n"; $topic =~ s/[^0-9]*$// ; # print "$topic \n"; if ( $haveitem != 0 ) { do saveitem(); # save current entry do writestate(); # write current state } $havetopic = $topic; print "Topic = $topic\n"; next; } # if start of new item, save last in %nitems # if start of new state, merge last state if (/^[A-Z][A-Z].*AUTHOR:/) { #new item if ($haveitem) {do saveitem();} /^([A-Z][A-Z]) *([0-9A-Za-z][0-9A-Za-z]*) *([0-9].*)AUTHOR:/; if (length($1) != 2) { print "**** BILL ID ERROR in $_ **** \n"; print ERR "**** BILL ID ERROR in $_ **** \n"; next; } $sortid=sprintf("%2.2s %s %s",$1,$2,$3); $sortid =~ s/ *$// ; $state=substr($_,0,2); $state =~ y/A-Z/a-z/; if ($havestate ne "" && $havestate ne $state) { # $save = $_ ; do writestate(); # print "Input line: was $save\n now $_\n"; } $id = substr($_,0,14); $id =~ s/ *$//; #trim trailing space $havestate=$state; $_ =~ s/.*AUTHOR:/AUTHOR:/ ; $_ =~ s/ *$//; $item = $_ ; #print "New: Itemid = $id $sortid\n"; $haveitem = 1; next; } if (/SUMMARY:/) { $havesum = 1;} if ($haveitem == 0 ) { next;} if (/STATUS:/) { $instatus = 1;} if ($instatus ) { if ( /RATIFIED/) { $signed = 's';} #NC variation if ( /[Ss]igned/) { $signed = 's';} } if (/END OF REPORT/) { do saveitem(); do writestate(); $havetopic=""; next; } $_ =~ s/ // ; # strip right indent $item .= $_ ; # append current line to item } do saveitem(); do writestate(); exit; # save item to %nitems array w/sortid as key # add header line of: # >smmddyy # where s = s or n (signed or not) # m = month # d = day date item is written to database (now) # y = year (2 digits) # if duplicate keep one with summary field sub saveitem { if ($haveitem = 0 ) {return; } if ($havetopic eq "" ) {return; } $haveitem = 0; $instatus = 0; if ($havesum == 0) { # no summary, check for alternate if ($nitems{$sortid} && $nitems{$sortid} =~ /SUMMARY:/) { return; } } $nitems{$sortid} = sprintf (">%s%02d%02d%02d\n%s%s\n%s",$signed,$mon,$mday,$year,"BILL ID: ",$id,$item) ; $signed = 'n'; $havesum = 0; return; } # load any current items in for merging sub writestate { local ($_) ; # important to not alter upper $_ local ($*) = 1 ; # multi-line match needed %oitems=(); @olist=(); # sanity check - may not have any new input if ($havetopic eq "" ) {return; } if ($havestate eq "" ) {return; } $nname = sprintf ("%s/%s/%s",$dir,$havestate,$havetopic); print "Loading $nname \n" ; if (open (IN,"<$nname")) { # read in old items keeping old date $wsortid=""; $item=""; while () { if (/^>/) { # added header line if ($wsortid ne "") { $oitems{$wsortid} = $item; #store previous item $wsortid = ""; $item=""; # start new one } } $item .= $_ ; # collect lines of item # normalize key to match original input if (/^BILL ID: (..) *([0-9A-Za-z][0-9A-Za-z]*) *([0-9].*)/) { $wsortid=sprintf("%2.2s %s %s",$1,$2,$3); #print "Old id: $wsortid\n"; } } if ($wsortid ne "") { $oitems{$wsortid} = $item; # save the last one } close(IN); @olist = sort (keys(%oitems)); # sort the old keys $howmany = $#olist +1; print "$howmany old bills\n" ; } @nlist = sort(keys(%nitems)); # sort the new keys $howmany = $#nlist +1; print "$howmany updates\n" ; #now merge the lists and write out print "Writing $nname \n" ; if ($nname ne $lname ) { close OUT; unless (open (OUT,">$nname")) { $dirname = sprintf ("%s/%s",$dir,$havestate); printf "Creating $dirname\n"; mkdir ($dirname,0777); open (OUT,">$nname") || die "Can't open $nname"; $lname = $nname ; } } #print "@olist\n"; #print "@nlist\n"; $oldid=shift(@olist); # start with top two keys $newid=shift(@nlist); $current = "" ; # sanity check while ( $oldid && $newid ) { # compare and merge #print " oldid = $oldid newid = $newid\n"; if ($current ge $oldid || $current ge $newid) { print "***** MERGE ERROR at $current *****\n"; print ERR "***** MERGE ERROR at $current *****\n"; } if ($oldid eq $newid ) { # merge summary w/new # if anything beyond date is changed use new # this keeps the old date on duplicates if ( ( ($t1) = $oitems{$oldid} =~ /BILL ID:([^\0]*)/ ) && ( ($t2) = $nitems{$newid} =~ /BILL ID:([^\0]*)/ ) && ($t1 eq $t2)) { print "Match: unchanged using OLD $oldid\n"; print OUT $oitems{$oldid} ; $current = $oldid ; } else { print "Match: using NEW $newid \n"; if ($nitems{$newid} =~ /SUMMARY:/) { #new has summary, toss old print OUT $nitems{$newid} ; $current = $newid ; print "NEW has summary\n"; } else { # snarf summary from old - note multi-line wierdness if (($status) = $oitems{$oldid} =~ /(^SUMMARY:\n[^\0]*)^STATUS:/) { # and insert into new - that was easy... substr($nitems{$newid},index($nitems{$newid},"STATUS:\n"),0) = $status ; } printf OUT $nitems{$newid} ; $current = $newid ; print "OLD has summary\n"; } } # this was a match, shift both lists to next item $oldid=shift(@olist); $newid=shift(@nlist); next; } # not a match, use alphabetically first item if ($oldid lt $newid ) { print "using OLD $oldid \n"; print OUT $oitems{$oldid} ; $current = $oldid ; $oldid = shift(@olist); next; } # newid must be > oldid print OUT $nitems{$newid} ; $current = $newid ; print "using NEW $newid \n"; $newid = shift(@nlist); next; } # one of the arrays is empty - write remaining part of other array if ($oldid) { print OUT $oitems{$oldid} ; print "using OLD $oldid \n"; foreach $oldid (@olist) { print OUT $oitems{$oldid} ; print "using OLD $oldid \n"; } } if ($newid) { print OUT $nitems{$newid} ; print "using NEW $newid \n"; foreach $newid (@nlist) { print OUT $nitems{$newid} ; print "using NEW $newid \n"; } } undef %nitems; #left over from trying to pin down a memory leak undef %oitems; # in an old version of perl %nitems=(); %oitems=(); $havestate=""; $haveitem=0; } Brought to you by Super Global Mega Corp .com