Path: utzoo!utgpu!cs.utexas.edu!usc!julius.cs.uiuc.edu!rpi!clarkson!grape.ecs.clarkson.edu!nelson From: nelson@sun.soe.clarkson.edu (Russ Nelson) Newsgroups: alt.sources Subject: code for reading from .?Q? files Message-ID: Date: 15 Nov 90 18:00:04 GMT References: <1422@tharr.UUCP> Sender: @grape.ecs.clarkson.edu Reply-To: nelson@clutx.clarkson.edu (aka NELSON@CLUTX.BITNET) Distribution: alt Organization: Clarkson University, Potsdam NY Lines: 561 In-Reply-To: gtoal@tharr.UUCP's message of 15 Nov 90 00:59:40 GMT In article <1422@tharr.UUCP> gtoal@tharr.UUCP (Graham Toal) writes: This posting consists of a set of routines which roughly simulate fopen, fgetc, fgets, and fclose. The difference between these and the originals is that these will read data from a .Z compressed file, decompressing it on the fly. It does *not* uses pipes, processes, or intermediate files. This makes it useful to add to any programs which read large text files sequentially. I had a very similar need, except that I needed to SEEK also. With compress, this is a problem. But with Huffman encoding, seeking isn't a problem, so long as you seek to a bit boundary rather than a byte boundary. Since other people may need to seek into squeezed files, here's my version. It reads files produced in the format of the "SQ" standard first seen on CP/M. It's derived from simtel20.army.mil:pd1:xsq.tar-z, and is kind of useless without it, because you need to modify xsq.c or xusq.c to convince it to tell you the code offsets. I've enclosed my xusqi.c program, which looks through a specially formatted-file (seven-line records), and maps a byte offset into the original file into a bit offset into the squeezed file. This code has been written for MSDOS, but should be portable to Unix. #! /bin/sh # This is a shell archive. Remove anything before this line, then unpack # it by saving it into a file and typing "sh file". To overwrite existing # files, type "sh file -c". You can also feed this as standard input via # unshar, or by typing "sh 'sqread.c' <<'END_OF_FILE' X#include X#include X#include X#include X#include X#include X#define INTERNAL X#include "sqread.h" X X#define ERROR (-1) X#define PATHLEN 312 /* Number of characters allowed in pathname */ X#define OK 0 X X#define RECOGNIZE 0xFF76 /* unlikely pattern */ X#define DLE 0x90 /* repeat byte flag */ X#define SPEOF 256 /* special endfile token */ X#define NUMVALS 257 /* 256 data values plus SPEOF*/ X Xstatic int getuhuff(SQFILE *sqp); Xstatic int portgetw(FILE *f); X X/* BUG: you cannot select an arbitrary position in the output stream X * to start your output (unless you make provisions for that). This is X * because of the repeat compression. You can make provision for seeking X * to the middle of a repeat sequence by terminating the repeat when X * compressing the file. X */ X XSQFILE * Xsqopen (const char *path) X{ X FILE *fp; X SQFILE *sqp; X int numnodes; X int i; X X fp = fopen(path, "rb"); X if (!fp) X return NULL; X sqp = malloc(sizeof(SQFILE)); X if (!sqp) X return NULL; X sqp->file = fp; X if(portgetw(fp) != (int) RECOGNIZE) {/* Process header */ X rewind(sqp->file); X sqp->issq = 0; X return sqp; X } X sqp->issq = 1; X (void) portgetw(fp); /* checksum */ X while (getc(fp)) /* Read and discard the name */ X ; X X numnodes = portgetw(fp); X if(numnodes < 0 || numnodes >= NUMVALS) { X /* invalid decode tree */ X fclose(fp); X return NULL; X } X /* Initialize for possible empty tree (SPEOF only) */ X sqp->sqleaf[0].children[0] = -(SPEOF + 1); X sqp->sqleaf[0].children[1] = -(SPEOF + 1); X X for(i = 0; i < numnodes; ++i) { /* Get decoding tree from file */ X sqp->sqleaf[i].children[0] = portgetw(fp); X sqp->sqleaf[i].children[1] = portgetw(fp); X } X sqp->rewpos = ftell(sqp->file); X sqp->bpos = 7; /* force initial read */ X sqp->repct = 0; /* start with no repeats */ X return sqp; X} X Xvoid Xsqrewind(SQFILE *sqp) X{ X sqp->bpos = 7; /* force initial read */ X sqp->repct = 0; /* start with no repeats */ X fseek(sqp->file, sqp->rewpos, SEEK_SET); X} X X X/* The "offset" is actually a bit offset within the file. Offset 0 is X * bit 0, byte 0. Offset 8 is bit 0, byte 1. Etc... X */ Xlong Xsqseek(SQFILE *sqp, long offset) X{ X int bpos; X long value; X X if (!sqp->issq) X return fseek(sqp->file, offset, SEEK_SET); X X bpos = offset & 7; X value = fseek(sqp->file, offset >> 3, SEEK_SET); X if (bpos == 0) { X sqp->bpos = 7; X } else { X sqp->bpos = bpos - 1; X sqp->curin = getc(sqp->file) >> (bpos - 1); X } X sqp->repct = 0; /* start with no repeats */ X return value; X} X X X/* Get bytes with decoding - this decodes repetition, X * calls getuhuff to decode file stream into byte X * level code with only repetition encoding. X * X * The code is simple passing through of bytes except X * that DLE is encoded as DLE-zero and other values X * repeated more than twice are encoded as value-DLE-count. X */ X Xint Xsqgetc(SQFILE *sqp) X{ X int c; X X if (!sqp->issq) X return getc(sqp->file); X X if(sqp->repct > 0) { X /* Expanding a repeated char */ X --sqp->repct; X return(sqp->value); X } else { X /* Nothing unusual */ X if((c = getuhuff(sqp)) != DLE) { X /* It's not the special delimiter */ X sqp->value = c; X if(sqp->value == EOF) X sqp->repct = INT_MAX; X return(sqp->value); X } else { X /* Special token */ X if((sqp->repct = getuhuff(sqp)) == 0) X /* DLE, zero represents DLE */ X return(DLE); X else { X /* Begin expanding repetition */ X sqp->repct -= 2; /* 2nd time */ X return(sqp->value); X } X } X } X} X X X/* Decode file stream into a byte level code with only X * repetition encoding remaining. X */ Xstatic int Xgetuhuff(SQFILE *sqp) X{ X int i; X X /* Follow bit stream in tree to a leaf*/ X i = 0; /* Start at root of tree */ X do { X if(++sqp->bpos > 7) { X if((sqp->curin = getc(sqp->file)) == EOF) X return(ERROR); X sqp->bpos = 0; X /* move a level deeper in tree */ X i = sqp->sqleaf[i].children[1 & sqp->curin]; X } else X i = sqp->sqleaf[i].children[1 & (sqp->curin >>= 1)]; X } while(i >= 0); X X /* Decode fake node index to original data value */ X i = -(i + 1); X /* Decode special endfile token to normal EOF */ X return(i == SPEOF) ? EOF : i; X} X X Xchar X*sqgets (char *s, int n, SQFILE *sqp) X{ X int c = '\0'; X X while (c != '\n' && n > 1) { X if ((c = sqgetc(sqp)) == EOF) X return NULL; X if (c == '\r') X continue; X *s++ = c; X --n; X } X *s = '\0'; X return s; X} X X Xlong Xsqtell (SQFILE *sqp) X{ X if (!sqp->issq) X return ftell(sqp->file); X return (ftell(sqp->file) << 3) + sqp->bpos; X} X X Xlong Xsqsize (SQFILE *sqp) X{ X fseek(sqp->file, 0, SEEK_END); X if (!sqp->issq) X return ftell(sqp->file); X return (ftell(sqp->file) << 3) + 7; X} X X Xint Xsqclose (SQFILE *sqp) X{ X int value = fclose(sqp->file); X free(sqp); X return value; X} X X X/* X * Machine independent getw which always gets bytes in the same order X * as the CP/M version of SQ wrote them X */ Xstatic int Xportgetw(f) XFILE *f; X{ X int c; X X c = getc(f) & 0377; X return(c | (getc(f) << 8)); X} END_OF_FILE if test 4903 -ne `wc -c <'sqread.c'`; then echo shar: \"'sqread.c'\" unpacked with wrong size! fi # end of 'sqread.c' fi if test -f 'sqread.h' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'sqread.h'\" else echo shar: Extracting \"'sqread.h'\" \(709 characters\) sed "s/^X//" >'sqread.h' <<'END_OF_FILE' Xstruct sqleaf { /* Decoding tree */ X int children[2]; /* left, right */ X}; X Xtypedef struct { X#ifdef INTERNAL X FILE *file; X struct sqleaf sqleaf[256]; X int issq; /* non-zero if this file's squeezed */ X int bpos; /* last bit position read */ X int curin; /* last byte value read */ X int repct; /* Number of times to return value */ X int value; /* current byte value or EOF */ X long rewpos; /* rewind seek position */ X#endif X} SQFILE; X XSQFILE *sqopen (const char *path); Xvoid sqrewind (SQFILE *stream); Xlong sqseek (SQFILE *stream, long offset); Xchar *sqgets (char *s, int n, SQFILE *stream); Xlong sqtell (SQFILE *stream); Xlong sqsize (SQFILE *stream); Xint sqclose (SQFILE *stream); END_OF_FILE if test 709 -ne `wc -c <'sqread.h'`; then echo shar: \"'sqread.h'\" unpacked with wrong size! fi # end of 'sqread.h' fi if test -f 'xusqi.c' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'xusqi.c'\" else echo shar: Extracting \"'xusqi.c'\" \(5092 characters\) sed "s/^X//" >'xusqi.c' <<'END_OF_FILE' Xstatic char *sccsid = "@(#)usq.c 1.7u (UCF) 82/12/15"; X/* X * usq.c - CP/M compatible file unsqueezer utility X * X * compile as follows: X * cc [-DVAX] -O usq.c -o usq X * (define VAX only if running on VAX) X */ X X#include X#include X#include X#include X#include X X#define ERROR (-1) X#define PATHLEN 312 /* Number of characters allowed in pathname */ X#define OK 0 X X#define RECOGNIZE 0xFF76 /* unlikely pattern */ X#define DLE 0x90 /* repeat byte flag */ X#define SPEOF 256 /* special endfile token */ X#define NUMVALS 257 /* 256 data values plus SPEOF*/ X X/* 16 bit ints, please */ Xtypedef int INT16; Xtypedef unsigned UNSIGNED; Xtypedef long LONG32; X Xstruct sqleaf { /* Decoding tree */ X INT16 children[2]; /* left, right */ X}; Xstruct sqleaf Dnode[NUMVALS - 1]; X XINT16 Bpos; /* last bit position read */ XINT16 Curin; /* last byte value read */ XINT16 Repct; /* Number of times to return value */ XINT16 Value; /* current byte value or EOF */ X XINT16 getcr(void); XINT16 getuhuff(void); XINT16 portgetw(FILE *f); XLONG32 remap(LONG32); X Xmain(int argc, char *argv[]) X{ X squeeze(argv[1]); X} X X/* X The following code is primarily from typesq.c and utr.c. Typesq Xis a modification of USQ by Dick Greenlaw. Those modifications (usq Xto typesq) were made by Bob Mathias, I am responsible for the butchery Xdone to make it work with cat. X X*/ X XFILE *in; Xsqueeze(fname) Xchar *fname; X{ X register INT16 i, c; X register char *p; X register INT16 numnodes; /* size of decoding tree */ X register UNSIGNED crc; X UNSIGNED filecrc; X char inl[BUFSIZ]; X INT16 inlcnt; X X init_cr(); init_huff(); crc=0; X X if((in=fopen( fname, "rb"))==NULL) { X fprintf(stderr, "usq: can't open %s\n", fname); X return ERROR; X } X if(portgetw(in) != (INT16) RECOGNIZE) {/* Process header */ X fprintf(stderr, "usq: %s is not a SQueezed file\n", fname); X return(ERROR); X } X filecrc = (UNSIGNED) portgetw(in); /* checksum */ X while (getc(in)) /* Read and discard the name */ X ; X X numnodes = portgetw(in); X if(numnodes < 0 || numnodes >= NUMVALS) { X fprintf(stderr, "usq: %s has invalid decode tree\n", fname); X fclose(in); X return(ERROR); X } X /* Initialize for possible empty tree (SPEOF only) */ X Dnode[0].children[0] = -(SPEOF + 1); X Dnode[0].children[1] = -(SPEOF + 1); X X for(i = 0; i < numnodes; ++i) { /* Get decoding tree from file */ X Dnode[i].children[0] = portgetw(in); X Dnode[i].children[1] = portgetw(in); X } X inlcnt = 0; X while (fgets(inl, sizeof(inl), stdin)){ X if (++inlcnt == 7) { X sprintf(inl, "%ld\n", remap(atol(inl))); X inlcnt = 0; X } X fputs(inl, stdout); X } X fclose(in); X return(OK); X} X X/* Map the original source position into the compressed bit position. X * X */ XLONG32 Xremap(LONG32 pos) X{ X static LONG32 inpos = 0L; X int c; X X while (pos != inpos) { X if ((c = getcr()) == EOF) X abort(); X#ifdef notdef X putchar(c); X#endif X inpos++; X } X fprintf(stderr, "ftell() = %ld, Bpos = %d\n", ftell(in), Bpos); X if (Bpos == 99) X return (ftell(in) << 3) + ((Bpos == 99)?0:(Bpos+1)); X return ((ftell(in)-1) << 3) + Bpos+1; X} X X X/*** from utr.c - */ X/* initialize decoding functions */ X Xinit_cr() X{ X Repct = 0; X} X Xinit_huff() X{ X Bpos = 99; /* force initial read */ X} X X/* Get bytes with decoding - this decodes repetition, X * calls getuhuff to decode file stream into byte X * level code with only repetition encoding. X * X * The code is simple passing through of bytes except X * that DLE is encoded as DLE-zero and other values X * repeated more than twice are encoded as value-DLE-count. X */ X XINT16 Xgetcr() X{ X register INT16 c; X X if(Repct > 0) { X /* Expanding a repeated char */ X --Repct; X return(Value); X } else { X /* Nothing unusual */ X if((c = getuhuff()) != DLE) { X /* It's not the special delimiter */ X Value = c; X if(Value == EOF) X Repct = INT_MAX; X return(Value); X } else { X /* Special token */ X if((Repct = getuhuff()) == 0) X /* DLE, zero represents DLE */ X return(DLE); X else { X /* Begin expanding repetition */ X Repct -= 2; /* 2nd time */ X return(Value); X } X } X } X} X/* Decode file stream into a byte level code with only X * repetition encoding remaining. X */ X XINT16 Xgetuhuff() X{ X register INT16 i; X X /* Follow bit stream in tree to a leaf*/ X i = 0; /* Start at root of tree */ X do { X if(++Bpos > 7) { X if((Curin = getc(in)) == EOF) X return(ERROR); X Bpos = 0; X /* move a level deeper in tree */ X i = Dnode[i].children[1 & Curin]; X } else X i = Dnode[i].children[1 & (Curin >>= 1)]; X } while(i >= 0); X X /* Decode fake node index to original data value */ X i = -(i + 1); X /* Decode special endfile token to normal EOF */ X return(i == SPEOF) ? EOF : i; X} X/* X * Machine independent getw which always gets bytes in the same order X * as the CP/M version of SQ wrote them X */ XINT16 Xportgetw(f) XFILE *f; X{ X register INT16 c; X X c = getc(f) & 0377; X return(c | (getc(f) << 8)); X} END_OF_FILE if test 5092 -ne `wc -c <'xusqi.c'`; then echo shar: \"'xusqi.c'\" unpacked with wrong size! fi # end of 'xusqi.c' fi echo shar: End of shell archive. exit 0 -- --russ (nelson@clutx [.bitnet | .clarkson.edu]) FAX 315-268-7600 It's better to get mugged than to live a life of fear -- Freeman Dyson I joined the League for Programming Freedom, and I hope you'll join too.