$<PDPRSX
GET "LIBHDR"
GET "SPEHDR"
$>PDPRSX

$<CAP1
GET "LIBHDR"
GET ".progs.spehdr"
$>CAP1

LET getnode( n ) = VALOF		// get a chunk of pseudo heap
    TEST (n+heapvecpointer) <= heapvecsize THEN
    $(
	LET v = heapvec + heapvecpointer
	heapvecpointer := heapvecpointer + n + 1
	RESULTIS v
    $)
    ELSE

$<PDPRSX
    RESULTIS IE.NBF
$>PDPRSX

$<CAP1
  $<HEAPTEST
    RESULTIS -1				// Try any negative number
  $>HEAPTEST
  $<HEAPTEST'
    $(
	MANIFEST $(	inc.size = 4*1024	// ammount we increase heap by
			max.before.inc = (32*1024) - 2 - inc.size	// heapvecsize is 1 less than segment size
		 $)
	TEST heapvecsize <= max.before.inc THEN
	$(
	    LET rc = CHANGESIZE( heapvec , inc.size )
   $<LOG
	    LET o = OUTPUT()
	    SELECTOUTPUT( log )
	    WRITEF("More heap got after %N*N", COMPTIME() - start.time)
	    UNLESS o = 0 DO SELECTOUTPUT( o )
   $>LOG
	    heapvecsize := heapvecsize + inc.size	// get more heap if we can
	    IF rc = 0 THEN RESULTIS getnode( n )
	    SELECTOUTPUT( MSTREAM )
	    WRITEF("*N******FATAL ERROR - failed to get extra heap - %S*N", FAULT( rc ))
	    FINISH
	$)
	ELSE RESULTIS -1		// Have as much heap as we can get
    $)
  $>HEAPTEST'
$>CAP1


AND writeword( old, new ) BE		// write a coded word to a sort file
$(
  LET count = old%0
    FOR I=1 TO count DO			// its OK using count coz
      IF old%I NE new%I THEN
      $(
	 count := I-1			// when we update it we BREAK the loop
	 BREAK
      $)

    Binwrch( count )			// binary count of chars same as prev.
    Binwrch( new%0 )			// binary count of end of word position

    FOR I=count+1 TO new%0 DO Binwrch( new%I )
$)


AND readword( oldword ) BE		// read next word given the last
$(
    LET count = BINRDCH()		// pick up similarity count
    LET len   = BINRDCH()		// and final length
	TEST count = ENDSTREAMCH
	THEN $(
		oldword%0  := 0
		TERMINATOR := ENDSTREAMCH
	     $)
	ELSE $(
		FOR I=count+1 TO len DO oldword%I := BINRDCH()
		oldword%0  := len
		TERMINATOR := ' '
	     $)
$)


AND build.tree (root) BE			// build lots of trees and write them out
$(  LET word = VEC wordsperword
    LET post = VEC wordsperword

    linecount := 1				// input line number
    seed.tree( root )				// pre-seed

    $(  Read.input.word( FALSE, word, post )
	IF TERMINATOR=ENDSTREAMCH & word%0=0 THEN BREAK
	UNLESS record (root, word )	// FALSE -> no space
	       DO $(
			write.tree( root )	// chuck it out

$<PDPRSX
			ENDWRITE()		// close it
			find.newoutput()	// open a new one
$>PDPRSX

$<CAP1
			// Preserve current anon file capablity in sort file
			// vector,and get a new anon file.
			// Check that version numbers are OK

			sort.file.cap!version :=  GETSLOT()
			ENDWRITE()		// close it
			MOVECAP( K.N0 , sort.file.cap!version )
			find.newoutput()	// open a new one
			version := version + 1
			IF version > maxsortversion THEN
			$(
			    SELECTOUTPUT(MSTREAM)
			    WRITEF("*N******FATAL ERROR about line %N - Input too big for sorting*N",linecount)
			    STOP(16)
			$)
$>CAP1

			!root := 0		// empty tree
			seed.tree( root )	// clear and pre-seed
			record(root, word)	// and record this
		  $)
	wordcount := wordcount + 1	// one more word

    $)  REPEAT
$)


AND Read.input.word(reflecting, word, post) BE
$(
    LET p   = ?
    LET top = ?

    $(
	$(
		TERMINATOR := RDCH()
		UNLESS 	TERMINATOR=' '  |	// skip format characters
			TERMINATOR='*N' |
			TERMINATOR='*C' |
			TERMINATOR='*P' |
			TERMINATOR='*E' |
			TERMINATOR='*T' |
			TERMINATOR='*'' | 	// and leading ignorables
			TERMINATOR='*"' |
			TERMINATOR='('  |
			TERMINATOR='['  |
			TERMINATOR='{'  |
			TERMINATOR='#'  	// ROFF and GCAL precise space
		    DO BREAK
		IF	TERMINATOR='*N' |
			TERMINATOR='*C' |
			TERMINATOR='*P'
		    THEN linecount := linecount + 1
		IF reflecting THEN Wrch(TERMINATOR)
	$) REPEAT

	p := 0

	UNLESS TERMINATOR=ENDSTREAMCH 		// step back to pick up first valid character
	       DO UNRDCH()

	$(
	   TERMINATOR := RDCH()			// pick up a char
	   IF TERMINATOR=' '  |
	      TERMINATOR='*N' |
	      TERMINATOR='*C' |
	      TERMINATOR='*P' |
	      TERMINATOR='*E' |
	      TERMINATOR='*T' |
	      TERMINATOR='#'  |			// ROFF/GCAL precise space
	      TERMINATOR=ENDSTREAMCH 		// break if it is a format char
	     THEN BREAK
	   p := p + 1
	   word%p := TERMINATOR			// else remember it
	$) REPEAT

	UNLESS TERMINATOR=ENDSTREAMCH 		// step back to pick up next time
	       DO UNRDCH()

	top := p

	UNTIL p=0 DO
	$(					// now strip all trailing chars
	   LET ch = Capitalch( word%p )		// that are not alphanumeric
	   IF ('A' <= ch <= 'Z') | ('0' <= ch <= '9')
	      THEN BREAK
	   p := p - 1
	$)

	IF (p>2) & (CAPITALCH(word%p) = 'S') & (word%(p-1) = '*'')
	   THEN p := p - 2			// strip possessives


	FOR I=1 TO p DO				// check for a completely numeric result
	    UNLESS '0' <= word%I <= '9' GOTO some.not.numeric
	p := 0
      some.not.numeric:

	TEST TERMINATOR=ENDSTREAMCH | p > 0 THEN
	$(					// build the POST string
		FOR I=p+1 TO top DO
		    post%(I-p) := word%I
		post%0 := top-p
		BREAK
	$)
	ELSE IF reflecting THEN
	$(
		word%0 := top			// write out all the skipped chars
		Writes(word)
	$)
    $) REPEAT

    word%0 := p
$)

AND seed.tree( root ) BE			// shove the seed set into the tree
$(
  LET lc = linecount
    linecount := -1				// mark them all as seeds
    FOR I=1 TO seedset!0 DO
	record( root, seedset!I )
    linecount := lc				// restore
$)


AND record (node, string) = VALOF		// put a node into the tree
$(
    TEST !node = 0 THEN				// empty space!
    $(
	LET l    = length (string)		// make a new node
	LET leaf = getnode (l.string + l)
	LET leaf.st = leaf + l.string
	IF  leaf < 0 THEN RESULTIS FALSE	// no pseudo-heap left
	FOR i = 0 TO l DO leaf.st!i := string!i	// copy string
	leaf!l.minus := 0
	leaf!l.plus  := 0
	leaf!l.first := linecount
	leaf!l.count := 1
        !node := leaf
	RESULTIS TRUE				// done
    $)
    OR
    $(
        node := !node
        SWITCHON comparestring (string, node + l.string) INTO
        $(CASES

        CASE +1:
            node := node + l.plus
	    ENDCASE

        CASE -1:
            node := node + l.minus
	    ENDCASE

        CASE 0:
            node!l.count := node!l.count + 1	// already there!
	    RESULTIS TRUE
        $)CASES
    $)
$) REPEAT					// saves having to recurse


AND length (string) = (string%0)/BYTESPERWORD	// word length of string


AND ucequalstring (a, b) = VALOF
$(
    UNLESS a%0 = b%0 DO RESULTIS FALSE
    FOR i = 1 TO a%0 DO
	UNLESS CAPITALCH (a%i) = CAPITALCH (b%i) DO RESULTIS FALSE
    RESULTIS TRUE
$)


AND find.newoutput() BE

// find a new LEXICON.SOR stream (PDPRSX)
// Get a new anon file for sorting into (CAP)

$(
$<PDPRSX
   LET file = "LEXICON.SOR"
     newoutput := FINDOUTFILE ( file )
$>PDPRSX

$<CAP1
    newoutput := findoutfile( "/A" )
$>CAP1

$<LOG
     SELECTOUTPUT( log )
  $<PDPRSX
     Writen(Time()); Newline()
  $>PDPRSX
  $<CAP1
    WRITEF("New anon file after %N*N", COMPTIME() - start.time)
  $>CAP1
$>LOG

    SELECTOUTPUT( newoutput )
$)


AND write.tree( root ) BE			// chuck out a tree
$(
	write.node( root )			// recursive write-out
	oldnode := heapvec			// keep an old node
	(oldnode+l.string)%0 :=0		// with a null string
	heapvecpointer := l.string+1		// reset the heap pointer
$)


AND write.node( root ) BE UNLESS !root = 0 DO	// write out the nodes
$(						// in sorted order
    LET node = !root
    LET lc   = node!l.first

	write.node( node+l.minus )		// all the earlier junk
	   UNLESS lc<0 DO			// - then its a seed word
	   $(
	     writeword( oldnode+l.string, node+l.string )
	     write.info( lc, node!l.count )
	     oldnode := node
	   $)
	write.node( node+l.plus )		// all the later junk
$)


AND write.info( lc, co ) BE			// binary chars for efficiency
$(
    Binwrch( lc>>8 )
    Binwrch( lc & #377 )
    Binwrch( co )
    distinctwordcount := distinctwordcount + 1	// one more distinct word
$)



$<PDPRSX
AND merge() = VALOF				// merge all the intermediate files
$(
    LET file     = "LEXICON.SOR"
    LET in1, in2 = ?, ?
    LET filename = VEC 100
    LET addon    = ";000"			// we diddle the version numbers
    LET length   = file%0

  $<LOG
    LET o        = OUTPUT()

	SELECTOUTPUT( log )
	Writes("Merge*N")
	SELECTOUTPUT( o )
  $>LOG

	FOR I=0 TO length DO	filename%I := file%I	// add on the addon
	FOR I=1 TO addon%0 DO	filename%(length+I) := addon%I

	filename%0 := length + addon%0
	length     := length+2

	$(					// start at the first we made
		place( version, filename, length )
		in1 := FINDINPUT( filename )
		version := version + 1		// and work upwards
		place( version, filename, length )
		in2 := FINDINPUT( filename )
		version := version + 1

		TEST (in1>0) & (in2>0)		// have two inputs?
		     THEN $(			//   then merge them
				find.newoutput()
				do.merge(in1, in2)
				ENDWRITE()
				SELECTINPUT( in1 )
				DELETEINPUT()	// and delete them
				SELECTINPUT( in2 )
				DELETEINPUT()
			  $)
		ELSE TEST (in1>0) & (in2=IE.NSF)
		     THEN RESULTIS in1		// only one means we've finished
		ELSE $(
			SELECTOUTPUT( SYSOUT )	// PANIC!!!!
			Writef("Merge IOerror %N, %N*N",in1,in2) <> FINISH
		     $)
	$) REPEAT
$)

AND place( n, s, off ) BE			// shove in an octal version num
$(
    LET d1 = (n>>6) & 7
    LET d2 = (n>>3) & 7
    LET d3 = (n   ) & 7
	s%(off  ) := d1 + '0'
	s%(off+1) := d2 + '0'
	s%(off+2) := d3 + '0'
$)
$>PDPRSX



$<CAP1
AND merge() = VALOF

//  Merges the anon intermediate sort files
//  Uses the sort.file.cap vector as a circular buffer of slots
//  Merges each file from each slot with the next one, putting the result
//  at the current end of the used slots, until the input pointer catches
//  up with the output pointer

$(
    LET in1,in2 = ?,?
    LET out = ?
    LET inslot = 0
    LET outslot = version

  $<LOG
    LET o = OUTPUT()

	SELECTOUTPUT( log )
	WRITEF("Merge started after %N*N", COMPTIME() - start.time)
	UNLESS o = 0 DO SELECTOUTPUT( o )
  $>LOG

    $(
	outslot := outslot = maxsortversion -> 0,outslot+1	// Update outslot, wrap slot vector if reqd.

	in1 := FILEFROMSLOT( sort.file.cap!inslot , 1)	// Open first input file
	IF in1 = 0 THEN merge.error.exit()
	inslot := inslot = maxsortversion -> 0 , inslot+1	// Get next slot ready
	IF inslot = outslot THEN RESULTIS in1		// Only 1 file, so finished, return open stream ready for use.

	in2 := FILEFROMSLOT( sort.file.cap!inslot , 1)	// Open 2nd input file
	IF in2 = 0 THEN merge.error.exit()
	inslot := inslot = maxsortversion -> 0 , inslot+1	// Get next slot ready

	find.newoutput()
	do.merge( in1 , in2 )			// Merge the 2 inputs to new output
	SELECTINPUT( in1 )			// and free the slots used
	ENDREAD()
	SELECTINPUT( in2 )
	ENDREAD()
	sort.file.cap!outslot := GETSLOT()
	ENDWRITE()				// Get capability for new file
	MOVECAP( K.N0 , sort.file.cap!outslot )	// and save so can re-read
	version := version + 1			// New version number
    $) REPEAT

$)


AND merge.error.exit() BE			// Fall over tidily if merge fails
$(
    LET errstring = FAULT(ERRORCODE)
    SELECTOUTPUT(MSTREAM)
    WRITEF("*N******FATAL ERROR - can not reopen sort file - %S*N",errstring)
    STOP(16)
$)
$>CAP1



AND read.pair() = VALOF				// Binrdch a two-byte number
$(
    LET a = BINRDCH() << 8
    RESULTIS a + BINRDCH()
$)


AND do.merge( in1, in2 ) BE			// merge two input streams
$(
    LET word1 = VEC wordsperword		// from in1
    LET word2 = VEC wordsperword		// from in2
    LET last  = VEC wordsperword		// copy of last written
    LET lc1, lc2 = ?, ?				// line-numbers
    LET n1,  n2  = ?, ?				// frequency counts
    LET read1, read2 = TRUE, TRUE		// what we need to read from

    last%0 := 0					// all empty
    word1%0, word2%0 := 0, 0
    distinctwordcount := 0			// the last merge gives the right answer

	$(					// repeat down the files
	    IF read1 THEN			// read in from in1
	    $(
		SELECTINPUT( in1 )
		readword( word1 )
		lc1 := read.pair()
		n1  := Binrdch()
		IF TERMINATOR<0 THEN		// end-of-file
		   TEST read2 THEN		// have we a word2 in hand
		   $(				//   - no
			copy( in2, word2, last )
			RETURN			// copy rest of other stream
		   $)
		   ELSE
		   $(				//   - yes
			Writeword( last, word2 )
			Write.info( lc2, n2 )	// write it out
			copy( in2, word2, word2 )
			RETURN			// copy rest of other stream
		   $)
		read1 := FALSE			// got one
	    $)

	    IF read2 THEN
	    $(
		SELECTINPUT( in2 )
		readword( word2 )
		lc2 := read.pair()
		n2  := Binrdch()
		IF TERMINATOR<0 THEN		// run out?
		$(				//  chuck out word1
		    Writeword( last, word1 )
		    Write.info(lc1,n1)
		    copy( in1, word1, word1 )
		    RETURN			// and copy rest
		$)
		read2 := FALSE
	    $)

	    SWITCHON comparestring( word1, word2 ) INTO	// who first????
	    $(
	    	CASE -1:			// word1
			Writeword( last, word1 )
			FOR I=0 TO (word1%0)/BYTESPERWORD DO last!I := word1!I
	    		Write.info(lc1,n1)
			read1 := TRUE		// need a new one
			ENDCASE
		CASE 0:				// matches perfectly!
			Writeword( last, word1 )
			FOR I=0 TO (word1%0)/BYTESPERWORD DO last!I := word1!I
			Write.info(min(lc1,lc2),n1+n2)	// get first line-number
			read1, read2 := TRUE, TRUE
			ENDCASE
		CASE +1:
			Writeword( last, word2 )
			FOR I=0 TO (word2%0)/BYTESPERWORD DO last!I:= word2!I
	    		Write.info(lc2,n2)
			read2 := TRUE
			ENDCASE
	    $)
	$) REPEAT
$)


AND copy( s, lastin, lastout ) BE		// copy stream s. lastin was
$(						// last read on this stream,
    LET new = VEC wordsperword			// lastout was last written
    LET lc, co = ?, ?

    SELECTINPUT( s )
    FOR I=0 TO (lastin%0)/BYTESPERWORD DO new!I := lastin!I
    $(
	readword( new ); lc := read.pair(); co := BINRDCH()
	IF TERMINATOR<0 THEN BREAK
	Writeword( lastout, new )
	Write.info(lc,co)
	FOR I=0 TO (new%0)/BYTESPERWORD DO lastout!I := new!I
    $) REPEAT
$)

