Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!uunet!stephsf!wengland
From: wengland@stephsf.stephsf.com (Bill England)
Newsgroups: comp.lang.perl
Subject: Multi line, flat file, record reader ...
Message-ID: <459@stephsf.stephsf.com>
Date: 9 Jan 91 21:48:17 GMT
Organization: Stephen Software Systems, Inc., Tacoma WA
Lines: 277

package rec_reader; 
## 
 #  Copyright (c) 1991, Stephen Software Systems, Inc.  
 #  All Rights Reserved.
 #
 #  Permission is granted to all interested parties to distribute 
 #  this perl library under the terms of the GNU PUBLIC LICENSE,
 #
 #  ( This is the same license included with most major GNU       )
 #  ( software packages.  Look for the file COPYING in the build  )
 #  ( directory of the GNU software.                              )
 #
 # This copyright and license notice must be retained. This 
 # program is distributed WITHOUT ANY WARRANTY and without even 
 # the implied warranty of MERCHANTABILITY or FITNESS FOR A
 # PARTICULAR PURPOSE.
 #
 # # #
 #
 # This software system is currently under construction/design/hack,
 # Please don't mind the mess :-)   ...
 #
 # Formatting Notes:
 #     80 column width, 4 space tabs.
 #
 # Modification Notes:
 #
 #      Bill England, Sun Dec 02 15:00:03 PST 1990, 
 #		Library documentation.
 # 
 #      Bill England, November 1990
 #		Created.
 #
 # Please send problems, enhancements, and corrections to
 # support@stephsf.COM
##
##
 #  _Randomly arranged record parser_ library.  This library is
 #  used for parsing randomly arranged, multiple line, flat record
 #  files where a single record is not necessarily on one line and
 #  where record formats and record/field separators differ from file
 #  to file.
 #
 #  Routines;
 #      Parse_Index,
 #			Is used to find and parse the first record
 #			(the index) of the file.  To use it, feed each
 #			line of the file to Parse_Index and check its
 #			return value.  When Parse_Index returns true (1)
 #			the index has been found and separated into
 #			field names.
 #
 #      Parse_Rec,
 #      Rtn_Last_Rec,
 #			Are used to accept multiple lines of a file and
 #			return a completely found record in an associative
 #			array keyed by the field names from the current
 #			file's index.
##
##
 #   As an example of how to use these routines the following 
 #   example is provided.
 # # # #
 #
 # $f_have_index = 0;
 # $last_in_file_name = 0;
 # while(<>){
 #		chop;
 # 
 #		if( $f_have_index == 0  ){
 #			$f_have_index = &Parse_Index($_);
 #		}else{
 #			if( $last_in_file_name eq $ARGV ){
 #				%rec = &Parse_Rec($_);
 #			}else{
 #				  #  When changing files force  out the last record
 #				  #  and reset the have index flag.
 #					%rec = &Rtn_Last_Rec;
 #					$f_have_index = &Parse_Index($_);
 #			}
 # 				&Your_Routine_That_Uses_The_Data( %rec ) if %rec;
 #		}
 #		$last_in_file_name = $ARGV;
 #	}
 #	die "Incomplete record file.\nAn index and records were not found.\n"
 #		unless $f_have_index;
 # 
 #  %rec = &Rtn_Last_Rec;
 #  &Your_Routine_That_Uses_The_Data( %rec ) if %rec;
 #
 # # # # # # # # #
 # An example data file follows; ( record separator = !
 #                                  and field separator = ~ )
 #
 #!~Company
 #~Name
 #~TitleName
 #~Address
 #~CityStateZip
 #
 #!~Washington State Employees Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~P.O. Box WSECU
 #~Olympia, Washington    98507
 #
 #!~Weyerhaeuser Tacoma Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~33615 First Way South
 #~Federal Way, Washington     98003
 #
 #!~Alaska Airlines Employee Federal Credit Union
 #~Jane Smith
 #~Ms. Smith
 #~19530 Pacific Hwy South #201
 #~Seattle, Washington       98188
##
##
 # Parse_Rec depends on having a valid index record found
 # at the beginning of a file.  The function Parse_Index is
 # required to run before Parse_Rec/Rtn_Last_Rec can be used.
 #
 # Global Vars ( Multi_Line_Rec library ):
 #   rec_sep     ... record separator.
 #   fld_sep	 ... field separator.
 #   rec_concat  ... is the string buffer for the current record.
 #   parse       ... T/F indicates if parse has started and triggers
 #                   above variable initializations.
##

# Library state, reset when the file is loaded.
$parse = 0;          # false until Parse_Index has seen the first line of a file
@name_index = ();    # lower-cased field names taken from the index record

# A library pulled in with require has to end up yielding a true value.
1;
##
 # Parse_Rec, accumulate one input line and return a record once it is
 # known to be complete.
 #
 #   Argument: $line_in, the next line of the data file.  The line is
 #             appended to the record buffer exactly as passed in.
 #
 #   Returns:  an associative array keyed by the field names of the
 #             current index when this line contains the start of the
 #             next record; otherwise the empty list.  The empty list
 #             is also returned, after a complaint on STDERR, when the
 #             number of fields in the record differs from the index.
 #             At most one record is returned per call.
 #
 #   Dies:     when the parse flag is not set, i.e. Parse_Index has not
 #             been started.
 #
 # The separators are read from the data file, so they are escaped
 # before being used as patterns.  A separator such as | . * or + is
 # therefore matched literally instead of acting as a regular
 # expression operator.
##
sub main'Parse_Rec{
	local( $line_in ) = @_;
	local( $rec_pat, $fld_pat );

	die "The Parse_Index function has not yet completed successfully.\n"
		unless  $parse;

	# Backslash every non-word character of the separators.  Written
	# out by hand so that the library keeps running on Perl 4.
	( $rec_pat = $rec_sep ) =~ s/(\W)/\\$1/g;
	( $fld_pat = $fld_sep ) =~ s/(\W)/\\$1/g;

	$rec_concat = $rec_concat.$line_in;

	# Only a line holding a record separator can finish a record.
	if( $line_in =~ /$rec_pat/ )
	{
		local( $idx, $fld_count, @flds, $cnt, %ass_r, $trash );

		# Everything in front of the first record separator is the
		# finished record, the rest belongs to the next one.
		#
		local( $fld_data, $remain ) = split(/$rec_pat/, $rec_concat, 2 );

		if( $remain ne '' ){
			# Drop the leading field separator of the next record and
			# keep what follows as the new buffer contents.
			($trash, $rec_concat) = split(//, $remain, 2);

			@flds = split(/$fld_pat/, $fld_data);

			# If trailing fields are blank/null then split leaves them
			# out of @flds.  Counting the separators instead gives the
			# exact number of fields on the record.

			$fld_count = ($fld_data =~ s/$fld_pat/$fld_sep/g);
			$fld_count++;

			if(@name_index != $fld_count){
			 print STDERR "Number of fields in record does not match index.\n";
			 print STDERR "$fld_data\n",
							join( "$fld_sep", @name_index ),"\n","\n";
			 return ();
			}

			# Pair each index name with the field in the same position.
			$cnt = 0;
			foreach $idx (@name_index){
				$ass_r{$idx} = $flds[$cnt++];
			}
			return %ass_r;
		}
	}
	return (); # for consistency use return although "();" might be faster.
}
##
 # Rtn_Last_Rec, Return last record.
 #
 #   Takes no arguments.  Whatever is left in the record buffer is
 #   treated as the final record of the file; no following record is
 #   needed to terminate it.
 #
 #   Returns:  an associative array keyed by the field names of the
 #             current index, or the empty list (after a complaint on
 #             STDERR) when the number of fields differs from the index.
 #
 #   Side effect: clears the parse flag, so the next call to Parse_Index
 #             starts over with a fresh index.
 #
 # The separators are read from the data file, so they are escaped
 # before being used as patterns.  A separator such as | . * or + is
 # therefore matched literally instead of acting as a regular
 # expression operator.
##
sub main'Rtn_Last_Rec{
	local( $rec_pat, $fld_pat, $fld_data, $remain );
	local( $idx, @flds, $cnt, $fld_count, %ass_r );

	# Backslash every non-word character of the separators.  Written
	# out by hand so that the library keeps running on Perl 4.
	( $rec_pat = $rec_sep ) =~ s/(\W)/\\$1/g;
	( $fld_pat = $fld_sep ) =~ s/(\W)/\\$1/g;

	# Returning the last record implies that any existing index
	# is now garbage and that the index parse boolean is no longer
	# true.
	#
	$parse = 0;

	# Only the text in front of the first record separator, if there is
	# one in the buffer, is used as the record.
	#
	($fld_data, $remain) = split(/$rec_pat/, $rec_concat, 2 );

	@flds = split(/$fld_pat/, $fld_data);

	# If trailing fields are blank/null then split leaves them out of
	# @flds.  Counting the separators instead gives the exact number
	# of fields on the record.

	$fld_count = ($fld_data =~ s/$fld_pat/$fld_sep/g);
	$fld_count++;

	if(@name_index != $fld_count){
	 print STDERR "Number of fields in record does not match index.\n";
	 print STDERR "$fld_data\n", join( "$fld_sep", @name_index ),"\n";
	 return ();
	}

	# Pair each index name with the field in the same position.
	$cnt = 0;
	foreach $idx (@name_index){
		$ass_r{$idx} = $flds[$cnt++];
	}
	return %ass_r;
}
##
 # Parse_Index, find the separators and the field names of a file.
 #
 #   Argument: $line_in, the next line of the data file.  Call once per
 #             line, starting with the first line of the file, until a
 #             true value comes back.
 #
 #   Returns:  1 when the index record is complete, which is known once
 #             the start of the following record has been seen;
 #             0 when more lines are needed.
 #
 #   Dies:     when the first two characters of the file are the same,
 #             since record and field separator must differ.
 #
 #   On success @name_index holds the lower-cased field names and the
 #   record buffer holds the beginning of the first data record.
 #
 # The separators are read from the data file, so they are escaped
 # before being used as patterns.  A separator such as | . * or + is
 # therefore matched literally instead of acting as a regular
 # expression operator.
##
sub main'Parse_Index {
	local( $line_in ) = @_;
	local( $remain, $rec_pat, $fld_pat, $fld_names, $trash );

	# First char is record separator
	# Second char is  field separator
	# Everything left is name of first field
	#
	if (!$parse){
		$rec_concat = '';
		($rec_sep, $fld_sep, $remain) = split(//, $line_in, 3);

		die "Identical Record and Field Seperators.\n"
			unless $rec_sep ne $fld_sep;

		# Flag the parse as started only after the separators have been
		# accepted, so a trapped die does not leave the flag set.
		$parse = 1;

		$line_in = $remain;
	}

	# Backslash every non-word character of the separators.  Written
	# out by hand so that the library keeps running on Perl 4.
	( $rec_pat = $rec_sep ) =~ s/(\W)/\\$1/g;
	( $fld_pat = $fld_sep ) =~ s/(\W)/\\$1/g;

	$rec_concat = $rec_concat.$line_in;

	# The index is complete once a record separator shows up in the
	# buffer with more text behind it, i.e. the next record has begun.
	#
	($fld_names, $remain) = split(/$rec_pat/, $rec_concat, 2 );

	if( $remain ne '' ){
		# trash is the first fld_sep of the next record.
		# ( Works like  chop except it chops the first char
		#   instead of the last. )
		#
		($trash, $rec_concat) = split(//, $remain, 2);

		# Make the record names case insensitive,
		# others may want to change this ... a runtime option
		# could  be created, say -s (case-sensitive).
		#
		$fld_names =~ y/A-Z/a-z/;

		@name_index = split(/$fld_pat/, $fld_names);
		return 1;
	}
	return 0;
} # end of parse label index
-- 
 +-  Bill England,  wengland@stephsf.COM -----------------------------------+
 |   * *      H -> He +24Mev                                                |
 |  * * * ... Oooo, we're having so much fun making itty bitty suns *       |
 |__ * * ___________________________________________________________________|