Xref: utzoo comp.unix.questions:19082 comp.lang.perl:147 Path: utzoo!utgpu!jarvis.csri.toronto.edu!mailrus!cs.utexas.edu!usc!elroy.jpl.nasa.gov!jpl-devvax!lwall From: lwall@jpl-devvax.JPL.NASA.GOV (Larry Wall) Newsgroups: comp.unix.questions,comp.lang.perl Subject: Re: Utility.... Message-ID: <6797@jpl-devvax.JPL.NASA.GOV> Date: 16 Jan 90 00:12:47 GMT References: <25b16c0c:746comp.unix.questions@tronsbox.UUCP> Reply-To: lwall@jpl-devvax.JPL.NASA.GOV (Larry Wall) Organization: Jet Propulsion Laboratory, Pasadena, CA Lines: 145 In article <25b16c0c:746comp.unix.questions@tronsbox.UUCP> tron1@tronsbox.UUCP (HIM) writes: : : Hmm.. I am having soime trouble with a local problem and I am sure it is : only because I am not familiar enough with UNIX to choose the right tool. : : I have some text that is broken with nl's at the wrong place (I run a BBS : and these are files I have been uploaded... : : Like: : : This is a test of the errors I have seen in some of the tex : t : files I have and I really need a way to fix them. : : This is a new paragraph. : : I would like it to put the "t" at the end of TEXT where it belongs and : reformat that text so that it is correct BUt recognize that 2 nl's in a row : are a new paragraph. : : As a bonus, the ability to set the left and right margons for the indent and : wordwrap would help! You've asked for a very difficult thing, unless you can give a better rule for where the spurious nl's occur, such as "every n chars", or "it's always the last word in the line". Still, one can do a reasonable approximation of correctness. Here is a semi-solution, assuming you have perl 3.0, and enough memory to slurp in /usr/dict/words. It errs on the conservative side, in that if both ends of a line break are real words, it doesn't combine them, resulting in an occasional extra space in the middle of a word. 
It could easily be made to err on the other side, so that you'd have
occasional words combined that made a compound word accidentally across a
line break.

You get the bonus of specifying your left and right margins.  I also took
the liberty of adding a -i flag to specify the indent on each paragraph.

Larry Wall
lwall@jpl-devvax.jpl.nasa.gov

#!/bin/sh
: make a subdirectory, cd to it, and run this through sh.
echo 'If this kit is complete, "End of kit" will echo at the end'
echo Extracting nonl
# Every line of the embedded script starts with a guard 'X'; sed strips the
# first X on each line while writing the file "nonl".
sed >nonl <<'!STUFFY!FUNK!' -e 's/X//'
X#!/usr/bin/perl
X
X# nonl - rejoin words that were split by stray newlines, then refill text.
X#
X# Usage: nonl -l left_margin -r right_margin -i indent [files]
X#
X# Input is read a paragraph at a time (paragraphs are separated by blank
X# lines) from the named files, or from standard input if none are given.
X# Where a line break falls between two fragments of which at least one is
X# not in /usr/dict/words, but whose concatenation is, the fragments are
X# glued back together.  Each paragraph is then refilled between the
X# requested margins using a format generated at run time.
X#
X# A switch value may be attached (-l5) or given as the next argument (-l 5).
X
X$left = 0;
X$right = 72;
X$indent = 0;
X
Xwhile ($ARGV[0] =~ /^-(.*)/) {
X    $_ = $1;
X    shift;
X    if (s/^l//) {
X        $left = $_;
X        $left = shift if $left eq '';
X    }
X    elsif (s/^i//) {
X        $indent = $_;
X        $indent = shift if $indent eq '';
X    }
X    elsif (s/^r//) {
X        $right = $_;
X        $right = shift if $right eq '';
X    }
X    else {
X        die "Unrecognized switch: -$_\n";
X    }
X}
X
X# The following kludge lets us define a format at run-time, since formats
X# don't parse right inside eval yet.
X#
X# The generated format has an optional first picture line (only when an
X# indent was requested) followed by a ~~ line that repeats until $new is
X# used up.
X
Xopen(TMP,">/tmp/nonl$$") || die "Can't create /tmp/nonl$$: $!\n";
Xprint TMP "format STDOUT =\n";
Xprintf TMP "%s^%s\n\$new\n",
X    ' ' x ($left + $indent), '<' x ($right - $left - 1 - $indent)
X    if $indent;
Xprintf TMP "%s^%s~~\n\$new\n",
X    ' ' x $left, '<' x ($right - $left - 1);
Xprint TMP "\n.\n";
Xclose TMP;
X# NOTE: no switch above sets $debug, so the debug output stays off unless
X# the script is edited.
Xsystem "cat /tmp/nonl$$" if $debug;
Xdo "/tmp/nonl$$";                   # so define the format already
X$do_error = $@;                     # hold the error so we can clean up first
Xunlink "/tmp/nonl$$";               # remove the temp file even on failure
Xdie $do_error if $do_error;
X
X# Slurp /usr/dict/words into associative array
X
Xopen(WORDS,'/usr/dict/words') || die "Can't open /usr/dict/words: $!\n";
Xwhile (<WORDS>) {                   # one dictionary word per line
X    chop;
X    y/A-Z/a-z/;                     # canonicalize
X    ++$word{$_};
X}
Xclose(WORDS);
X
X$/ = '';                            # enable paragraph mode
X$= = 10000000;                      # no top of forms
X
Xprint "Ready\n" if $debug;
X
Xwhile (<>) {
X    @old = split(/(\s)/);           # note we use () to include delimiters in @old
X    @new = ();
X    while ($#old > 0) {
X        if ($old[0] =~ /^\s$/) {    # discard old delimiters
X            shift(@old);
X            next;
X        }
X        $word1 = shift(@old);
X        push(@new,$word1);
X        if ($old[0] eq "\n" && $#old > 0) {     # Maybe bad break?
X            if ($word1 =~ /[.?!]['"]?$/) {      # Sentence break.
X                push(@new,'');                  # Force extra space.
X                next;
X            }
X            next if $word1 =~ /[,;:'")]$/;      # end of clause, so ok
X            $word2 = $old[1];
X            next if $word2 =~ /^[('"]/;         # beg of clause, so ok
X            # Canonicalize both fragments for the dictionary lookup:
X            # lowercase them and strip every character that is not a
X            # letter or apostrophe (all of them, not just the first).
X            $word1 =~ y/A-Z/a-z/;
X            $word1 =~ s/[^a-z']//g;
X            $word2 =~ y/A-Z/a-z/;
X            $word2 =~ s/[^a-z']//g;
X            if (!$word{$word1} || !$word{$word2}) { # if either isn't a word
X                if ($word{$word1 . $word2}) {   # and the combo is...
X                    shift(@old);                # chuck naughty \n
X                    $new[$#new] =~ s/-$//;      # chuck any hyphenation
X                    $new[$#new] .= shift(@old); # glue 2nd half on
X                }
X            }
X        }
X    }
X    # The inner loop stops with at most one element left; keep it unless
X    # it is just a whitespace delimiter.
X    push(@new,$old[0]) unless $old[0] =~ /^\s$/;
X    $new = join(' ',@new);
X    write;                          # this will do text filling for us
X}
!STUFFY!FUNK!
echo ""
echo "End of kit"
: I do not append .signature, but someone might mail this.
exit