Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!uunet!stephsf!wengland From: wengland@stephsf.stephsf.com (Bill England) Newsgroups: comp.lang.perl Subject: Multi line, flat file, record reader ... Message-ID: <459@stephsf.stephsf.com> Date: 9 Jan 91 21:48:17 GMT Organization: Stephen Software Systems, Inc., Tacoma WA Lines: 277 package rec_reader; ## # Copyright (c) 1991, Stephen Software Systems, Inc. # All Rights Reserved. # # Permission is granted to all interested parties to distribute # this perl library under the terms of the GNU PUBLIC LICENSE, # # ( This is the same license included with most major GNU ) # ( sofware packages. Look for the file COPYING in the build ) # ( directory of the GNU software. ) # # This copyright and license notice must be retained. This # program is distributed WITHOUT ANY WARRANTY and without even # the implied # warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. # # # # # # This software system is currently under construction/design/hack, # Please don't mind the mess :-) ... # # Formatting Notes: # 80 column width, 4 space tabs. # # Modification Notes: # # Bill England, Sun Dec 02 15:00:03 PST 1990, # Library documentation. # # Bill England, November 1990 # Created. # # Please send problems, enhancements, and corrections to # support@stephsf.COM ## ## # _Randomly arranged record parser_ library. This library is # used for parsing randomly arranged, multiple line, flat record # files where a single record is not necessarily on one line and # where record formats, and record/field separators differ from file # to file. # # Routines; # Parse_Index, # Is used to find and parse the first line # and index of the file. To use, feed each line # of the file to Parse_Index and check its return # value. When Parse_Index returns # true (1) the index has been found and separated # into field names. 
# # Parse_Rec, # Rtn_Last_Rec, # Are used to accept multiple lines of a file and # return a completely found record in an associative # array with the names of the fields from the current # file's index. ## ## # As an example of how to use these routines the following # example is provided. # # # # # # $f_have_index = 0; # $last_in_file_name = 0; # while(<>){ # chop; # # if( $f_have_index == 0 ){ # $f_have_index = &Parse_Index($_); # }else{ # if( $last_in_file_name eq $ARGV ){ # %rec = &Parse_Rec($_); # }else{ # # When changing files force out the last record # # and reset the have index flag. # %rec = &Rtn_Last_Rec; # $f_have_index = &Parse_Index($_); # } # &Your_Routine_That_Uses_The_Data( %rec ) if %rec; # } # $last_in_file_name = $ARGV; # } # die "Incomplete record file.\nAn index and records were not found.\n" # unless $f_have_index; # # %rec = &Rtn_Last_Rec; # &Your_Routine_That_Uses_The_Data( %rec ) if %rec; # # # # # # # # # # # An example data file follows; ( record separator = ! # and field separator = ~ ) # #!~Company #~Name #~TitleName #~Address #~CityStateZip # #!~Washington State Employees Credit Union #~Jane Smith #~Ms. Smith #~P.O. Box WSECU #~Olympia, Washington 98507 # #!~Weyerhaeuser Tacoma Credit Union #~Jane Smith #~Ms. Smith #~33615 First Way South #~Federal Way, Washington 98003 # #!~Alaska Airlines Employee Federal Credit Union #~Jane Smith #~Ms. Smith #~19530 Pacific Hwy South #201 #~Seattle, Washington 98188 ## ## # Parse_Rec depends on having a valid index record found # at the beginning of a file. The function Parse_Index # is required to return true before Parse_Rec/Rtn_Last_Rec can be used. # # Global Vars ( Multi_Line_Rec library ): # rec_sep ... record separator. # fld_sep ... field separator. # rec_concat ... is the string buffer for the current record. # parse ... T/F indicates if parse has started and triggers # above variable initializations. 
##

# Parser state shared by Parse_Index, Parse_Rec and Rtn_Last_Rec.  "our"
# keeps these as ordinary package variables while declaring them explicitly.
our @name_index = ();   # lower-cased field names taken from the current index
our $parse      = 0;    # true once Parse_Index has accepted the separators
our $rec_sep;           # record separator: first character of the index line
our $fld_sep;           # field separator: second character of the index line
our $rec_concat;        # input buffered for the record being assembled

##
#   _take_buffered_record,  private helper.
#
#   Splits $rec_concat at the first record separator.  When text follows
#   that separator (i.e. the next record has started) the first character
#   of that text -- the field separator that opens the next record -- is
#   dropped, the rest stays in $rec_concat, and a one element list holding
#   the text that preceded the separator is returned.  Otherwise the buffer
#   is left untouched and the empty list is returned.
#
#   The separator comes from the data file, so it is quoted with \Q...\E;
#   characters such as | ^ . * + ( would otherwise act as regex operators.
##
sub _take_buffered_record {
    my ($head, $tail) = split(/\Q$rec_sep\E/, $rec_concat, 2);
    return () unless defined $tail && $tail ne '';

    $rec_concat = substr($tail, 1);
    return ($head);
}

##
#   _build_rec,  private helper.
#
#   Arguments:
#       $fld_data ... text of one record, fields joined by $fld_sep.
#       $trailer  ... extra text printed after the mismatch diagnostic
#                     (Parse_Rec prints one more newline than Rtn_Last_Rec).
#
#   Returns a list of field-name/value pairs built from @name_index, or the
#   empty list (after a message on STDERR) when the number of fields in the
#   record differs from the number of names in the index.
##
sub _build_rec {
    my ($fld_data, $trailer) = @_;
    $fld_data = '' unless defined $fld_data;

    # A negative limit keeps trailing blank fields, so the length of @flds
    # is the exact field count and blank trailing fields come back as ''
    # just like blank fields in the middle of a record.  split returns
    # nothing for an empty string, which still stands for one blank field.
    my @flds = split(/\Q$fld_sep\E/, $fld_data, -1);
    @flds = ('') unless @flds;

    if (@name_index != @flds) {
        print STDERR "Number of fields in record does not match index.\n";
        print STDERR "$fld_data\n", join($fld_sep, @name_index), "\n", $trailer;
        return ();
    }

    my %rec;
    @rec{@name_index} = @flds;
    return %rec;
}

##
#   Parse_Rec,  Parse record and return successfully found record.
#
#   Argument: one line of input (already chopped by the caller).
#
#   The line is appended to the record buffer.  If the line contains the
#   record separator and text follows the first separator in the buffer,
#   the record that precedes it is returned as a list of name/value pairs
#   suitable for assigning to an associative array.  At most one record is
#   returned per call.  The empty list is returned when no record is
#   complete yet or when the field count does not match the index.
#
#   Dies if Parse_Index has not been started.
##
sub main::Parse_Rec {
    my ($line_in) = @_;

    die "The Parse_Index function has not yet completed successfully.\n"
        unless $parse;

    $rec_concat .= $line_in;

    # Only a line that carries a record separator can complete a record.
    return () unless index($line_in, $rec_sep) >= 0;

    my @found = _take_buffered_record();
    return () unless @found;

    return _build_rec($found[0], "\n");
}

##
#   Rtn_Last_Rec,  Return last record.
#
#   Returns whatever is buffered up to the first record separator (or the
#   whole buffer when there is none) as a list of name/value pairs, or the
#   empty list when its field count does not match the index.
#
#   Returning the last record implies that any existing index is now
#   garbage, so the parse flag is cleared; the next Parse_Index call starts
#   a new index.
##
sub main::Rtn_Last_Rec {
    my ($fld_data) = split(/\Q$rec_sep\E/, $rec_concat, 2);

    $parse = 0;

    return _build_rec($fld_data, '');
}

##
#   Parse Index functions.
#
#   Parse_Index,  feed it the lines of a file until it returns 1.
#
#   On the first line of a file:
#       First char is record separator
#       Second char is field separator
#       Everything left is name of first field
#
#   Returns 1 once the start of the first data record has been seen; at
#   that point @name_index holds the field names and the beginning of the
#   first record is already buffered for Parse_Rec.  Returns 0 otherwise.
#
#   Dies when the two separators are identical (this includes an empty
#   first line) or when the first line is too short to supply a field
#   separator.
##
sub main::Parse_Index {
    my ($line_in) = @_;

    if (!$parse) {
        my ($new_rec_sep, $new_fld_sep, $rest) = split(//, $line_in, 3);
        $new_rec_sep = '' unless defined $new_rec_sep;
        $new_fld_sep = '' unless defined $new_fld_sep;

        die "Identical Record and Field Seperators.\n"
            if $new_rec_sep eq $new_fld_sep;
        die "Index line does not supply a field separator.\n"
            if $new_fld_sep eq '';

        # Commit the new state only after validation, so a caught die does
        # not leave the parser claiming to have started.
        ($rec_sep, $fld_sep) = ($new_rec_sep, $new_fld_sep);
        $rec_concat = '';
        $parse      = 1;
        $line_in    = defined $rest ? $rest : '';
    }

    $rec_concat .= $line_in;

    # The index is complete once the next record has started, i.e. once a
    # second record separator followed by more text has been seen.
    my @found = _take_buffered_record();
    return 0 unless @found;

    # Make the record names case insensitive,
    # others may want to change this ... a runtime option
    # could be created, say -s (case-sensitive).
    #
    my $fld_names = $found[0];
    $fld_names =~ y/A-Z/a-z/;
    @name_index = split(/\Q$fld_sep\E/, $fld_names);
    return 1;
}   # end of parse label index

1;  # a library pulled in with require must return a true value

# --
#  +-  Bill England,  wengland@stephsf.COM -----------------------------------+
#  |   * *      H -> He +24Mev                                                |
#  |  * * * ... Oooo, we're having so much fun making itty bitty suns *       |
#  |__ * * ___________________________________________________________________|