Views
DB_btree_wdoc.stl
by
Paul McJones
—
last modified
2021-02-25 11:30
- History
-
Action Performed by Date and Time Comment Publish Paul McJones 2021-02-25 11:30 No comments.
"B-tree variant for word occurence_strings, with 2 cumulants in parent, refcounts."
-- file B_tree_for_bigs_wdoc.stl
package B_tree_for_wdocstring; -- B-trees, realized as objects
-- ***********************************************************************************************
-- ****** B-tree variant for word occurence_strings, with 2 cumulants in parent, refcounts ******
-- ***********************************************************************************************
-- Refcounts can be ignored except in the wo_set_comp, wo_insert, split_node, share_right, join_left,
-- and join_left routines.
-- this B-tree structures the wdoccs_big_string which holds word occurences
const wo_code_pts := {"wo_last", "wo_nlast2", "wo_nlast1", "wo_nc1", "wo_nc2", "wo_fpbot", "wo_fpsame",
"wo_fpinfirst", "wo_fpinmid", "wo_fpinlast", "wo_set_copy", "wo_set_last1", "wox_set_last1",
"wo_set_last2", "wox_set_last2", "wo_set_nlast1", "wo_set_nlast2", "wo_set_ndel_nc", "wo_set_ndel_comp",
"wo_set_ndel_end", "wo_set_del_nc", "wo_set_del_have", "wo_set_del_comp",
"wo_set_del_comp_end", "wo_set_del_enough", "wo_set_del_canpull", "wo_set_del_canjoin",
"wo_set_enough", "wo_set_cutback", "wo_in_copy", "wo_in_nend2", "wo_in_nend1", "wo_in_nend3",
"wo_in_end3", "wo_in_endnc", "wo_in_ncnosplit", "wo_in_ncsplit", "wo_in_endcomp",
"wo_in_endcomp_nos", "wo_in_endcomp_split", "wo_in_endcomp_nosthis", "wo_in_endcomp_splitthis",
"wo_in_nendnc_nos", "wo_in_nendnc_split", "wo_in_nendcomp_nos", "wo_in_nendcomp_split",
"wo_in_nendcomp_nosthis", "wo_in_nendcomp_splitthis", "wo_pull_left", "wo_pull_right",
"wo_share_copy", "wo_share_copy2", "wo_share_move_left",
"wo_share_move_right", "wo_jleft_copy", "wo_jleft", "wo_jright_copy", "wo_jright"};
-- code points to be traversed
var debug_flag := false;
var prior_debug_c := 0,debug_c := 0; -- global variables for debugging
var tree_level := 0; -- for tracking tree level during insertion
--procedure wo_create();
--procedure wo_get_cum(rec);
--procedure wo_get_cum2(rec);
--procedure wo_comp(rec,j);
--procedure wo_comp2(rec,j);
--procedure wo_comp_cum(rec,x);
--procedure wo_comp_cum2(rec,x);
procedure wo_first_past_to(rec,key,beg,ennd); -- search for a key in the indicated range
--procedure wo_set_comp(rw rec,w,x); -- assignment of the first component whose cumulant is at least w; w must be in range
--procedure wo_set_comp2(rw rec,w,x);-- assignment of the first component whose second cumulant is at least w
--procedure wo_insert(rw rec,j,x); -- insertion before j-th component; or at the end if j = OM
--procedure wo_insert2(rw rec,j,x); -- insertion before component with second cumulant at least j; or at end (j = OM)
procedure wo_dump(rec); -- get tuple from B-tree representation (DEBUGGING ONLY)
procedure show_hex(occstg); -- show an occurences string as comma-separated hex (DEBUGGING ONLY)
procedure shexify(stg); -- put into abbreviated hex (DEBUGGING ONLY)
procedure wo_check_tree_structure(tree); -- recursive check of tree structure (DEBUGGING ONLY)
end B_tree_for_wdocstring;
package body B_tree_for_wdocstring; -- B-trees, realized as objects
use setldb,byteutil,disk_records_pak,db_records,string_utility_pak;
-- there are two cumulants, the total number of occurences and the corresponding record id
procedure get_ch_cum(rec,j); -- get the integer cum value for the j-th child of this node
return int_of_5(dr_load(rec)(wo_cum_start + (j - 1) * 5..wo_cum_start + j * 5 - 1));
end get_ch_cum;
procedure hwo_get_cum(rec); -- get the integer cum value of the fina l child of this node
return get_ch_cum(rec,num_childr(rec));
end hwo_get_cum;
procedure hwo_get_cum2(rec); -- get the record id string cum value of the final child of this node
return get_ch_cum2(rec,num_childr(rec));
end hwo_get_cum2;
procedure set_ch_cum(rec,j,cum_int); -- set the integer cum value for the j-th child of this node
stg := dr_load(rec); -- make sure that this record is loaded
stg(wo_cum_start + (j - 1) * 5..wo_cum_start + j * 5 - 1) := stg_of_5(cum_int);
dr_setrecbuf(rec,stg); dr_dirtify(rec);
end set_ch_cum;
procedure get_ch_cum2(rec,j); -- get the record id string cum value for the j-th child of this node
return dr_load(rec)(wo_cum2_start + (j - 1) * 4..wo_cum2_start + j * 4 - 1);
end get_ch_cum2;
procedure hwo_create(); -- creation rout ine
rec := dr_new_rec(); -- create a new record (at this point, the record is loaded
-- dirty, and ho
set_type(rec,wdoccs_str_node_ncr); -- set to non-compound tree
return rec;
end hwo_create;
procedure set_ch_cum2(rec,j,new_rec_id); -- set the record id string cum value for the j-th child of this node
stg := dr_load(rec); -- make sure that this record is loaded
stg(wo_cum2_start + (j - 1) * 4..wo_cum2_start + j * 4 - 1) := new_rec_id;
dr_setrecbuf(rec,stg); dr_dirtify(rec);
end set_ch_cum2;
procedure vect_of_children(rec); -- gets vector of children, as a string of 4-byte record numbers
nch := num_childr(rec); -- number of children
return dr_load(rec)(wo_ch_start..wo_ch_start - 1 + nch * 4);
end vect_of_children;
procedure set_vect_of_children(rec,stg); -- sets vector of children, from a string of 4-byte record numbers
--print("set_vect_of_children: ",hexify(rec)," ",hexify(stg));
set_num_childr(rec,(nstg := #stg)/4); -- set the number of children (also loads)
missing := wo_cum_start - wo_ch_start - #stg;
--if missing < 0 then print("missing,wo_cum_start,wo_ch_start,#stg: ",missing," ",#stg," ",hexify(stg)); stop; end if;
stg2:=dr_load(rec);
stg2(wo_ch_start..wo_cum_start - 1) := (stg + missing * "\x00");
-- set the children, remembering not to change the length of the string section containing them
dr_setrecbuf(rec,stg2); dr_dirtify(rec);
end set_vect_of_children;
procedure vect_of_cums(rec); -- gets vector of cums, as a string of 5-byte fields
nch := num_childr(rec); -- number of children
return dr_load(rec)(wo_cum_start..wo_cum_start - 1 + nch * 5);
end vect_of_cums;
procedure set_vect_of_cums(rec,stg); -- sets vector of cumulants, from a string of 5-byte record numbers
missing := wo_cum2_start - wo_cum_start - #stg;
stg2:=dr_load(rec); -- force load
stg2(wo_cum_start..wo_cum2_start - 1) := (stg + missing * "\x00");
-- set the cumulants, remembering not to change the length of the string section containing them
dr_setrecbuf(rec,stg2); dr_dirtify(rec);
end set_vect_of_cums;
procedure vect_of_cums2(rec); -- gets vector of cums, as a string of 4-byte fields
nch := num_childr(rec); -- number of children
return dr_load(rec)(wo_cum2_start..wo_cum2_start - 1 + nch * 4);
end vect_of_cums2;
procedure set_vect_of_cums2(rec,stg); -- sets vector of children, from a string of 4-byte record numbers
missing := rec_size + 1 - wo_cum2_start - #stg;
stg2:=dr_load(rec); -- force load
stg2(wo_cum2_start..rec_size) := (stg + missing * "\x00");
-- set the children, remembering not to change the length of the string section containing them
dr_setrecbuf(rec,stg2); dr_dirtify(rec);
end set_vect_of_cums2;
procedure voc(rec,j); -- n'th member of vector of children
stg := dr_load(rec); -- load this string
cjstrt := (j - 1) * 4 + wo_ch_start;
return stg(cjstrt..cjstrt + 3); -- return child rec
end voc;
procedure set_voc(rec,j,chrec); -- set n'th member of vector of children
cjstrt := (j - 1) * 4 + wo_ch_start;
stg2:=dr_load(rec); -- force load; then set the first character of the record
stg2(cjstrt..cjstrt + 3) := chrec;
dr_setrecbuf(rec,stg2); dr_dirtify(rec);
end set_voc;
procedure wo_dump(rec); -- get tuple from nested representation
-- is compound representation or direct, depending on whether 'is_compound' flag is set
-- note that this shows the vector of components, without their cumulants
-- we ignore the cumulants in the tree nodes
var indent := 0;
return wo_dump_in(rec); -- call inner workhorse
procedure wo_dump_in(rec); -- inner workhorse
if dr_is_compound(rec) then -- compound case
indent +:= 1; nc := num_childr(rec);
stg := (["\n" + (indent * " ") + "("] +/ [wo_dump_in(voc(rec,j)): j in [1..nc]])
+ ["\n" + (indent * " ")+ "[" + str(nc) + "]"
+ str(wo_get_cum(rec)) + "," + shexify(wo_get_cum2(rec)) + ")"];
indent -:= 1;
return stg;
end if; -- done with compound case
t := [];
for j in [1..num_childr(rec)] loop
chj := voc(rec,j);
svj := if (lvj := wo_length(chj)) > 0 then show_hex(wo_slice(chj,1,lvj)) else "" end if; t with:= svj;
end loop;
return ["\n" + (indent * " ") + "("] + t + [str(wo_get_cum(rec)) + "," + shexify(wo_get_cum2(rec)) + ")"];
end wo_dump_in;
end wo_dump;
procedure show_hex(occstg); -- show an occurences string as comma-separated hex
return "" +/ [shexify(occstg(j..j + 3)) + if j > #occstg - 4 then "" else "," end if: j in [1,5..#occstg]];
end show_hex;
procedure shexify(stg); -- put into abbreviated hex
ns := #(stg := hexify(stg)); zers := span(stg,"0");
return if ns = 8 then ":" + stg else ":" + stg + ":" + str(#zers) end if;
end shexify;
procedure hwo_comp(rec,j); -- fetch of component containing cumulant j
return comp_cum_ix(rec,1,j)(1);
end hwo_comp;
procedure hwo_comp2(rec,j); -- fetch of component containing second cumulant j
return comp_cum_ix(rec,2,j)(1);
end hwo_comp2;
procedure hwo_comp_cum(rec,x); -- fetch of x-th component, with cumulan ts
return comp_cum_ix(rec,1,x);
end hwo_comp_cum;
procedure hwo_comp_cum2(rec,x); -- fetch of component containing second cumulant x, with cumulants
return comp_cum_ix(rec,2,x);
end hwo_comp_cum2;
procedure comp_cum_ix(rec,srch_on,x); -- fetch of x-th component by search on specified cumulant
-- component should be found by binary search
-- search for the first index component with cumulant past
-- the specified x; Return OM if there is none such.
-- this returns the triple [leaf,int_cum_of_leaf,second_cum_of_leaf]
-- NOTE: this should be by binary search
-- find the first node for which a cumulant >= j
if x = OM then -- want last node
pass("wo_last");
j := num_childr(rec);
the_cum := if srch_on = 1 then get_ch_cum(rec,j) else get_ch_cum2(rec,j) end if;
else
if srch_on = 2 then
iofx := int_of_4(x);
pass("wo_nlast2");
else
pass("wo_nlast1");
end if;
if not (exists j in [1..num_childr(rec)] |
if srch_on = 1 then (the_cum := get_ch_cum(rec,j)) >= x else
int_of_4(the_cum := get_ch_cum2(rec,j)) >= iofx end if) then
--print("******** return OM: ",x," ",nc := num_childr(rec)," ",wo_dump(rec)," ",if nc > 0 then get_ch_cum(rec,nc) else "NONE" end if); stop;
return OM;
end if;
end if;
-- desired node not found
if not dr_is_compound(rec) then -- if node is not compound we have what we want
if srch_on = 1 then
pass("wo_nc1");
return [voc(rec,j),the_cum,get_ch_cum2(rec,j)];
else
pass("wo_nc2");
return [voc(rec,j),get_ch_cum(rec,j),the_cum];
end if;
end if;
if srch_on = 1 then prev_cum := if j = 1 then 0 else get_ch_cum(rec,j - 1) end if; end if;
-- the preceding cumulant (needed in the integer case only)
res := comp_cum_ix(voc(rec,j),srch_on,if x = OM or srch_on = 2 then x else x - prev_cum end if);
-- continue the search recursively
[rec_no,retcum,retcum2] := res;
return [rec_no] + [prev_cum + retcum,retcum2];
end comp_cum_ix;
procedure wo_first_past_to(rec,key,beg,ennd); -- search for a key in the indicated range
-- this routine returns the pair [first_past_key,its_ix]
stg := dr_load(rec); -- read contents of the record
-- if the node is a wdoccs_string_record we search for the key directly
if stg(type_byte) = wdoccs_string_record then
pass("wo_fpbot");
ikey := int_of_4(key); -- convert to integer for purposes of comparison
if exists j in [beg..ennd min wo_length(rec)] | int_of_4(wos := wo_slice(rec,j,j)) >= ikey then
return [wos,j]; -- (this should be a binary search)
end if;
return OM; -- otherwise the desired key cannot be found
end if;
-- otherwise find the children containing the start and end leaves of the specified range
must := (exists ixbeg in [1..ncr := num_childr(rec)] | get_ch_cum(rec,ixbeg) >= beg);
if not must then print("** INDICATED STARTING POINT NOT PRESENT IN OCCURENCE VECTOR ** : ncr,beg:", ncr,beg); stop; end if;
must := (exists ixend in [ixbeg..ncr] | get_ch_cum(rec,ixend) >= ennd);
if not must then print("** INDICATED ENDING POINT NOT PRESENT IN OCCURENCE VECTOR ** ncr,ennd:", ncr,ennd); stop; end if;
prev_cum := if ixbeg = 1 then 0 else get_ch_cum(rec,ixbeg - 1) end if;
if ixbeg = ixend then -- both children are the same; search down recursively
pass("wo_fpsame");
if (res := wo_first_past_to(voc(rec,ixbeg),key,beg - prev_cum,ennd - prev_cum)) = OM then
return OM; -- the desired key cannot be found
end if;
[first_past_key,its_ix] := res; return [first_past_key,its_ix + prev_cum];
end if; -- otherwise search the first node to its end, then the middle nodes, then the last
if (res := wo_first_past_to(
first_ch := voc(rec,ixbeg),key,beg - prev_cum,get_ch_cum(rec,ixbeg) - prev_cum)) /= OM then
-- desired key is found in first child
[first_past_key,its_ix] := res;
pass("wo_fpinfirst");
return [first_past_key,its_ix + prev_cum];
end if;
ikey := int_of_4(key); -- convert to integer for purposes of comparison
if exists j in [ixbeg + 1,ixend - 1] | int_of_4(get_ch_cum2(rec,j)) >= ikey then
-- have found appropriate child
-- (should be binary search)
chj := voc(rec,j); chj_len := (get_ch_cum(rec,j) - (prev_cum := get_ch_cum(rec,j - 1)));
[first_past_key,its_ix] := wo_first_past_to(chj,key,1,chj_len);
pass("wo_fpinmid");
return [first_past_key,its_ix + prev_cum];
end if;
prev_cum := get_ch_cum(rec,ixend - 1); -- get the previous cum
if (res := wo_first_past_to(voc(rec,ixend),key,1,ennd - prev_cum)) /= OM then
-- desired key is found in last child
[first_past_key,its_ix] := res;
pass("wo_fpinlast");
return [first_past_key,its_ix + prev_cum];
end if;
return OM; -- otherwise the desired key cannot be found
end wo_first_past_to;
procedure hwo_set_comp(rw rec,w,x);-- assignment of the first component whose second cumulant is at least w
set_comp_ix(rec,1,w,x);
end hwo_set_comp;
procedure hwo_set_comp2(rw rec,w,x);-- assignment of the first component whose second cumulant is at least w
set_comp_ix(rec,2,w,x);
end hwo_set_comp2;
procedure set_comp_ix(rw rec,srch_on,w,x);
-- assignment of the first component whose specified cumulant is at least w; w must be in range
-- we must first copy rec if its refcount is greater than 1, and transfer one reference
-- from its old to its copied version
--print("set_comp_ix: ",wo_dump(rec));
if refcount(int_of_4(rec)) > 1 then -- must copy
pass("wo_set_copy");
stg := dr_load(rec); new_r := dr_new_rec();dr_setrecbuf(new_r,stg); dr_dirtify(new_r);
increfs(new_r,1); incref(rec,-1); rec := new_r; -- substitute copy for original
end if;
ic := dr_is_compound(rec); if srch_on = 2 then wint := int_of_4(w); end if;
ncr := num_childr(rec);
if w = OM then -- make change in last node
if srch_on = 1 then
pass("wo_set_last1");
w := get_ch_cum(rec,ix := ncr);
else
pass("wo_set_last2");
w := get_ch_cum2(rec,ix := ncr);
end if;
if ix = 0 then print("Deletion or change at end of empty tree is not allowed."); stop; end if;
elseif not (exists ix in [1..ncr] |
if srch_on = 1 then get_ch_cum(rec,ix) >= w else int_of_4(get_ch_cum2(rec,ix)) >= wint end if) then
if srch_on = 1 then
pass("wox_set_last1");
w := get_ch_cum(rec,ix := ncr);
else
pass("wox_set_last2");
w := get_ch_cum2(rec,ix := ncr);
end if;
if ix = 0 then print("Deletion or change at very end of empty tree is not allowed."); stop; end if;
end if;
-- get the local cumulant of the child
if srch_on = 1 then
old_cum := if ic then wo_get_cum(voc(rec,ix)) else wo_length(voc(rec,ix)) end if;
-- find the last cumulant value of voc(rec,ix), or the length of active part of voc(rec,ix)
pass("wo_set_nlast1");
else
pass("wo_set_nlast2");
old_cum := if ic then wo_get_cum2(voc(rec,ix)) else wo_slice(nd,wol := wo_length(nd),wol) end if;
-- find the last cumulant value of voc(rec,ix), or the last occurence in voc(rec,ix)
end if;
--print("old_cum: ",old_cum," ",hexify(dr_load(voc(rec,ix))));
if srch_on = 1 then prev_cum := if ix = 1 then 0 else get_ch_cum(rec,ix - 1) end if; end if;
if x /= OM then -- we are not dealing with a deletion
if not ic then -- non-compound case; we must add the change in the leaf cumulant
-- to the cumulant of all following nodes.
old_x := voc(rec,ix);
set_voc(rec,ix,x); -- change the component
xfref(old_x,x); -- transfer reference from old to new component
cum_change := wo_length(x) - old_cum;
update_cums(rec,ix,cum_change); -- update the cums,starting with the given child
set_ch_cum2(rec,ix,wo_slice(x,wol := wo_length(x),wol));
-- modify the cum of the changed component. Note that the following
-- second cumulants need not be changed. However, if ix references the
-- final child of rec, a cum2 entry in the parent of rec may need to be changed below.
pass("wo_set_ndel_nc");
return; -- done with this case
end if; -- otherwise we have the compound case
pass("wo_set_ndel_comp");
the_child := voc(rec,ix); -- the next operation may copy the child
wo_set_comp(the_child,if srch_on = 1 then w - prev_cum else w end if,x);
-- make the change in the child
cum_change := wo_get_cum(the_child) - old_cum;
set_voc(rec,ix,the_child); -- put the revised, possibly copied child back into position
-- the 'update_cums' procedure which we call now must then start with the (properly set)
-- cumulant of the preceding node, and then left_add the change in the the cumulant
-- to the cumulant of this child to all the subsequent children. This assumes that
-- (new_d - old_d) + c + old_d + e = c + new_d + e
-- for all cumulant values. This is obviously true for values using associative-commutative
-- cumulator functions with an inverse, and also in the (string) case where a + b = b.
update_cums(rec,ix,cum_change); -- update the cums,starting with the given child
-- the final cumulant of the_child might have been changed by
-- the preceding wo_set_comp(the_child,..) operation; see preceding comment.
if ix = ncr then
set_ch_cum2(rec,ix,wo_get_cum2(the_child));
pass("wo_set_ndel_end");
end if;
--print("compound self,ix: ",wo_get_cum()," ",ix);
return; -- done with the non-deletion case
end if; -- otherwise we are dealing with a deletion
if not dr_is_compound(rec) then -- non-compound case
the_ch := vect_of_children(rec);
the_child := the_ch(4 * ix - 3..4 * ix); -- capture the child before removal
incref(the_child,-1); -- drop one reference
the_ch(4 * ix - 3..4 * ix) := "";
pass("wo_set_del_nc");
if #the_ch > 0 then -- need not delete cumulant if no remaining children
stg := vect_of_cums(rec); -- get the vector of cums
stg((ix - 1) * 5 + 1..ix * 5) := ""; -- drop one element
set_vect_of_cums(rec,stg); -- put back into place
stg := vect_of_cums2(rec); -- get the second vector of cums
stg((ix - 1) * 4 + 1..ix * 4) := ""; -- drop one element
set_vect_of_cums2(rec,stg); -- put back into place
pass("wo_set_del_have");
end if;
-- delete the child from the string of descendants
set_vect_of_children(rec,the_ch); -- delete the ix-th node
-- delete the cumulant of the ix-th node. note that this has already been loaded
cum_change := -old_cum;
update_cums(rec,ix,cum_change); -- update the cums,starting with the appropriate child
return; -- done with this case
end if; -- otherwise we are dealing with a deletion in a compound case
the_child := voc(rec,ix); -- get the child
old_ch_leafsum := wo_get_cum(old_child := the_child);
wo_set_comp(the_child,w - prev_cum,OM); -- make the deletion in the child
new_ch_leafsum := wo_get_cum(the_child);
set_voc(rec,ix,the_child := the_child); -- re-insert the possibly modified child
cum_change := new_ch_leafsum - old_ch_leafsum;
update_cums(rec,ix,cum_change); -- update the cums,starting with the given child
-- since the last child may have been deleted, we also need to update the second cum
pass("wo_set_del_comp");
if ix = ncr then
set_ch_cum2(rec,ix,wo_get_cum2(the_child));
pass("wo_set_del_comp_end");
end if;
-- now it is possible that the child has lost enough children to have fallen below the required wo_low_lim
-- if this has happened, we attempt to share or join children with one of the adjacent siblings
if num_childr(the_child) >= wo_low_lim then
pass("wo_set_del_enough");
return;
end if; -- otherwise try to join or share
if pull_from_left(rec,ix) or pull_from_right(rec,ix) then
pass("wo_set_del_canpull");
return;
end if;
if join_with_left(rec,ix) or join_with_right(rec,ix) then
pass("wo_set_del_canjoin");
null;
end if;
-- In the 'join' case, either the left or the right join must work,
-- since in this compound case we must have at least one sibling.
-- but we must check to see if the node being processed has fallen to
-- just one child, and if it has, replace it by its single child.
if num_childr(rec) > 1 then
pass("wo_set_enough");
return;
end if;
pass("wo_set_cutback");
ch_rec := dr_load(the_ch := voc(rec,1)); dr_load(rec);
dr_setrecbuf(rec,ch_rec); dr_dirtify(rec);
set_vect_of_children(the_ch,""); -- the children have all moved
incref(the_ch,-1); -- child data is inherited from single child, which loses a reference
end set_comp_ix;
procedure update_cums(rec,ix,cum_change); -- update the cums of this tree,starting with the given child
-- now add this difference to all subsequent children
for j in [ix..nvc := num_childr(rec)] loop
set_ch_cum(rec,j,cum_change + get_ch_cum(rec,j));
end loop;
end update_cums;
procedure cumulate(rec);
-- initalize the cumulants of a node whose children are either leaves or already initialized
--print("CUMULATE!");
ic := dr_is_compound(rec); -- determine if compound
the_cum := 0;
for j in [1..num_childr(rec)] loop
nd := voc(rec,j); -- get the j-th child
--print("Child ",j," = ",int_of_4(nd));
nd_cum := if ic then wo_get_cum(nd) else wo_length(nd) end if;
--print("nd_cum=",nd_cum);
-- cumulant of the final child of the subnode, or occurence string length
the_cum := the_cum + nd_cum;
set_ch_cum(rec,j,the_cum); -- update the child's cumulant value
nd_cum2 := if ic then wo_get_cum2(nd) else wo_slice(nd,wol := wo_length(nd),wol) end if;
--print(int_of_4(nd_cum2));
-- cumulant2 of the final child of the subnode, or last element of occurence string
set_ch_cum2(rec,j,nd_cum2); -- update the child's cumulant value
end loop;
end cumulate;
procedure hwo_insert(rw rec,j,x); -- insertion before j-th component; or at the end if j = OM
insert(rec,1,j,x);
end hwo_insert;
procedure hwo_insert2(rw rec,j,x); -- insertion before component with second cumulant at least j;
insert(rec,2,j,x); -- or at the end if j = OM
end hwo_insert2;
procedure insert(rw rec,srch_on,j,x); -- insertion before j-th component with at least specified cumulant;
tree_level := 0; -- note that we are at the top of the tree
insert_in(rec,srch_on,j,x); -- call the inner routine
end insert;
procedure insert_in(rw rec,srch_on,j,x); -- insertion before j-th component with at least specified cumulant;
-- or at the end if j = OM
-- component should be found by binary search
if refcount(int_of_4(rec)) > 1 then -- must copy
pass("wo_in_copy");
stg := dr_load(rec); new_r := dr_new_rec(); dr_setrecbuf(new_r,stg); dr_dirtify(new_r);
increfs(new_r,1); incref(rec,-1); rec := new_r; -- substitute copy for original
end if;
result := OM; -- in case desired element not found
ic := dr_is_compound(rec); ncr := num_childr(rec);
the_cum := wo_get_cum(rec); -- get final cumulant of this tree
wol := x_cum := wo_length(x); -- cumulant value of the leaf x
x_cum2 := wo_slice(x,wol,wol); -- second cumulant value of the leaf x
if not ic then incref(x,1); end if; -- at the bottom level, the inserted leaf takes on an extra reference
if j /= OM then -- look for target node of insertion, if any
if srch_on = 2 then
pass("wo_in_nend2");
iofj := int_of_4(j);
else
pass("wo_in_nend1");
end if;
if exists ix in [1..ncr] |
if srch_on = 1 then (cum := get_ch_cum(rec,ix)) >= j else
int_of_4(cum := get_ch_cum2(rec,ix)) >= iofj end if then
result := [ix,if ix = 1 then 0 else get_ch_cum(rec,ix - 1) end if,cum];
nd := voc(rec,ix);
pass("wo_in_nend3");
else
pass("wo_in_end3");
result := OM;
end if;
end if;
if result = OM then -- we have insertion at the very end
if not ic then -- simply append to vector
pass("wo_in_endnc");
set_vect_of_children(rec,vect_of_children(rec) + x);
-- add the cumulant of x to the present cumulant of this tree
set_ch_cum(rec,nvc := ncr + 1,the_cum := the_cum + x_cum);
set_ch_cum2(rec,nvc,x_cum2);
--print("vect_of_children(rec): ",nvc," ",num_childr(rec)," ",ncr," ",wo_hi_lim + 1," ",hexify(vect_of_children(rec)));
if tree_level > 0 or nvc <= wo_hi_lim then -- no need to split
pass("wo_in_ncnosplit");
return;
end if; -- otherwise we must split, and becomes compound
pass("wo_in_ncsplit");
-- note that in this case we are at the very top of the tree
set_vect_of_children(rec,two_halves(rec)); -- split into 2 non-compound subtrees
set_ch_cum(rec,1,wo_get_cum(voc(rec,1))); set_ch_cum(rec,2,the_cum);
set_ch_cum2(rec,1,wo_get_cum2(voc(rec,1))); set_ch_cum2(rec,2,wo_get_cum2(voc(rec,2)));
set_is_compound(rec,true); -- note that it is indeed compound
return; -- done with this case
end if; -- otherwise we have insertion at the very end of a compound vector
last_child:= voc(rec,nvc := num_childr(rec)); -- get the last child
tree_level +:= 1; -- go down a level
insert_in(last_child,srch_on,OM,x); -- insert at the end of this last child
tree_level -:= 1; -- come up a level
set_voc(rec,ncr,last_child); -- insert the possibly modified child back into vect_of_children
-- add the cumulant of x to the present cumulant of this tree
set_ch_cum(rec,ncr,the_cum := the_cum + x_cum);
set_ch_cum2(rec,ncr,x_cum2);
pass("wo_in_endcomp");
if num_childr(last_child) <= wo_hi_lim then -- no need to split
pass("wo_in_endcomp_nos");
return;
end if; -- otherwise we must split the last child
pass("wo_in_endcomp_split");
split_node(rec,nvc); -- split the nvc-th node into two. we insert an empty node to the right
-- of node nvc, and then move half the children of the nvc-th node into the new node
nvc := num_childr(rec);
if tree_level > 0 or nvc <= wo_hi_lim then -- no need to split this node
pass("wo_in_endcomp_nosthis");
return; -- done with this case
end if;
-- otherwise we are at the very top of the tree, so we must split it into two parts and create a new tree level
set_vect_of_children(rec,two_halves(rec));
set_ch_cum(rec,1,wo_get_cum(voc(rec,1))); set_ch_cum(rec,2,the_cum);
set_ch_cum2(rec,1,wo_get_cum2(voc(rec,1))); set_ch_cum2(rec,2,wo_get_cum2(voc(rec,2)));
pass("wo_in_endcomp_splitthis");
return; -- done with this case
end if; -- end of case of insertion at the very end
-- in the remaining cases we have an insertion before one of our nodes
[ix,prev_cum,cum] := result; -- decode the result returned, geting the insertion result and the preceding cumulant
if not ic then -- insertion at appropriate position in non-compound vector
the_ch := vect_of_children(rec); the_ch(4 * ix - 3..4 * ix - 4) := x;
the_cums := vect_of_cums(rec); the_cums2 := vect_of_cums2(rec);
the_cums(5 * ix - 4..5 * ix - 5) :=
if ix = 1 then "\x00\x00\x00\x00\x00" else the_cums(5 * ix - 9..5 * ix - 5) end if;
the_cums2(4 * ix - 3..4 * ix - 4) := x_cum2;
set_vect_of_children(rec,the_ch); -- make insertion into list of children
set_vect_of_cums(rec,the_cums); -- make insertion into list of cums
set_vect_of_cums2(rec,the_cums2); -- make insertion into list of cums
update_cums(rec,ix,x_cum); -- adjust the given and following cumulants
if tree_level > 0 or (nvc := num_childr(rec)) <= wo_hi_lim then -- no need to split
pass("wo_in_nendnc_nos");
return; -- done with this case
end if; -- otherwise we must split, and becomes compound
pass("wo_in_nendnc_split");
-- otherwise we are at the very top of the tree, so we must split it into two parts and create a new tree level
set_vect_of_children(rec,two_halves(rec));
set_ch_cum(rec,1,wo_get_cum(voc(rec,1))); set_ch_cum(rec,2,the_cum + x_cum);
set_ch_cum2(rec,1,wo_get_cum2(voc(rec,1))); set_ch_cum2(rec,2,wo_get_cum2(voc(rec,2)));
set_is_compound(rec,true); -- note that it is indeed compound
--print("just became compound: ",dr_is_compound(rec)," ",wo_dump(rec)); stop;
return; -- done with this case
end if; -- othewise we deal with the compound case
c := voc(rec,ix); -- get the child into which the insertion will now be made
-- and get the position in this child at which the insertion will be made
tree_level +:= 1; -- go down a level
insert_in(c,srch_on,if srch_on = 1 then j - prev_cum else j end if,x); -- insert x into this child
tree_level -:= 1; -- come up a level
set_voc(rec,ix,c); -- put the possibly revised child back into position
update_cums(rec,ix,x_cum); -- adjust the given and following cumulants
set_ch_cum2(rec,ix,wo_get_cum2(c)); -- adjust the second cumulant on record for the child
-- which may have changed if the insertion was at the end of the child
if #(vect_of_children(c))/4 <= wo_hi_lim then -- no need to split the child
pass("wo_in_nendcomp_nos");
return;
end if; -- otherwise we must split the child
pass("wo_in_nendcomp_split");
split_node(rec,ix); -- split the child into two. we insert an empty node to the right
-- of node ix, and then move half the nodes into it
nvc := num_childr(rec);
if tree_level > 0 or nvc <= wo_hi_lim then -- no need to split this node
pass("wo_in_nendcomp_nosthis");
return; -- done with this case
end if; -- otherwise we are at the very top of the tree, so we must split it into two parts and create a new tree level
pass("wo_in_nendcomp_splitthis");
set_vect_of_children(rec,two_halves(rec));
set_ch_cum(rec,1,wo_get_cum(voc(rec,1))); set_ch_cum(rec,2,the_cum + x_cum);
set_ch_cum2(rec,1,wo_get_cum2(voc(rec,1))); set_ch_cum2(rec,2,wo_get_cum2(voc(rec,2)));
end insert_in;
procedure two_halves(rec); -- split this tree into two halves
-- the two nodes created share the children of the original tree, so that the refcounts of
-- these children need no adjustment. The nodes created each have a refcount of 1.
-- Note that this routine is only called if rec has no parent.
the_type := if (ic := dr_is_compound(rec)) then wdoccs_str_node_record else wdoccs_str_node_ncr end if;
u1 := dr_new_rec(); u2 := dr_new_rec(); -- make and initialize two subtrees
set_type(u1,the_type); set_type(u2,the_type); -- the halves are compound iff this tree is compound
-- we must subtract the cumulant of the last retained child from all the children that move
-- to get the cumulant of the left node
hnvc := (nvc := num_childr(rec))/2; cum_last_retained := get_ch_cum(rec,hnvc);
for j in [hnvc + 1..nvc] loop -- subtract this from the cumulant of each child that will move
set_ch_cum(rec,j,get_ch_cum(rec,j) - cum_last_retained);
end loop;
vocums := vect_of_cums(rec); -- get the vector of cums
vocums2 := vect_of_cums2(rec); -- get the vector of cums
set_vect_of_children(u1,(voch := vect_of_children(rec))(1..4 * hnvc));
set_vect_of_cums(u1,vocums(1..5 * hnvc)); -- u1 gets half the children and cums
set_vect_of_cums2(u1,vocums2(1..4 * hnvc)); -- u1 gets half the children and cums
set_vect_of_cums(u2,vocums(5 * hnvc + 1..)); -- the second half inherits adjusted cumulants from the original tree
set_vect_of_cums2(u2,vocums2(4 * hnvc + 1..)); -- likewise for the second cums
set_vect_of_children(u2,voch(4 * hnvc + 1..)); -- u2 gets the other half of the children
--print("two_halves: ",str(wo_dump(u1)),"\n",str(wo_dump(u2)),"\n",hexify(voch(4 * hnvc + 1..)),"\n",hexify(dr_load(u2)));
return u1 + u2;
-- assemble the two nodes of the new compound tree; return as a string
end two_halves;
procedure pull_from_left(rec,k); -- split children with left sibling
if k = 1 or num_childr(voc(rec,k - 1)) <= wo_low_lim then
return false;
end if;
pass("wo_pull_left");
share_right(rec,k - 1);
return true;
end pull_from_left;
procedure pull_from_right(rec,k); -- split children with right sibling
if k >= (nrv := num_childr(rec)) or num_childr(voc(rec,k + 1)) <= wo_low_lim then
return false;
end if;
pass("wo_pull_right");
share_right(rec,k);
return true;
end pull_from_right;
procedure share_right(rec,k); -- share children with right-hand sibling
-- we divide the children of the k-th node, together with those of the k+1'st,
-- into two roughly equal groups, and make these the k-th and k+1'st nodes. The
-- cumulative totals must be adjusted in the k-th node, in the
-- children moved between nodes, and in the children of the k+1'st node
-- this routine must first copy the nodes among which children will move, if they have more than 1 reference.
-- but it does not change the number of references to the children, so that their refcounts need no adjustment
nchkp1 := #(rkp1 := vect_of_children(ndkp1 := voc(rec,k + 1)))/4;
nchk := #(rk := vect_of_children(ndk := voc(rec,k)))/4; -- get the two groups of children, and their lengths
numleft := (nchk + nchkp1)/2; -- half the children; the left-hand will get this number of children
if refcount(int_of_4(ndkp1)) > 1 then -- must copy
pass("wo_share_copy");
stg := dr_load(ndkp1); new_r := dr_new_rec(); dr_setrecbuf(new_r,stg); dr_dirtify(new_r);
incref(ndkp1,-1); ndkp1 := new_r; -- substitute copy for original
increfs(new_r,1); set_voc(rec,k + 1,new_r);
end if;
if refcount(int_of_4(ndk)) > 1 then -- must copy
pass("wo_share_copy2");
stg := dr_load(ndk); new_r := dr_new_rec(); dr_setrecbuf(new_r,stg); dr_dirtify(new_r);
incref(ndk,-1); ndk := new_r; -- substitute copy for original
increfs(new_r,1); set_voc(rec,k,new_r);
end if;
if numleft > nchk then -- children will move left
pass("wo_share_move_left");
num_mov := numleft - nchk; -- the number that will move left
--print("move left: ",num_mov," ",hexify(dr_load(ndk)),"\n",hexify(dr_load(ndkp1)));
-- we must subtract the cumulant of the last child moving left
-- from the cumulants of all the right-hand children which do not move,
-- and must add this to the cumulant of the k-th node. We must also
-- add the cumulant of the last left-hand child of ndk to the cumulants of all
-- the right-hand children which do move.
ndk_cum := get_ch_cum(rec,k); -- get the cumulant of ndk
right_cum := wo_get_cum(ndk); -- cumulant of the last child of ndk
moved_cum := get_ch_cum(ndkp1,num_mov); -- cumulant of the last child of ndkp1 that moves
set_ch_cum(rec,k,moved_cum + ndk_cum); -- add moved_cum to the cumulant of the k-th node
set_ch_cum2(rec,k,get_ch_cum2(ndkp1,num_mov)); -- correct the second cumulant of the k-th node
for j in [1..num_mov] loop
set_ch_cum(ndkp1,j,right_cum + get_ch_cum(ndkp1,j));
end loop;
for j in [num_mov + 1..nchkp1] loop
set_ch_cum(ndkp1,j,get_ch_cum(ndkp1,j) - moved_cum);
end loop;
--print("move the children: ",rk," ",rkp1," ",num_mov);
-- and now we must move the corresponding cums
cumvp1 := vect_of_cums(ndkp1); -- second vector of cums
--print("ndkp1: ",hexify(ndkp1)," ",hexify(dr_load(ndkp1))); stop;
set_vect_of_cums(ndk,vect_of_cums(ndk) + cumvp1(1..num_mov * 5)); -- move the cums in
set_vect_of_cums(ndkp1,cumvp1(num_mov * 5 + 1..)); -- move the cums out
-- since the number of cums of ndkp1 is defined by its number of children, we need not edit that list
cumvp1 := vect_of_cums2(ndkp1); -- second vector of second cums
--print("ndkp1: ",hexify(ndkp1)," ",hexify(dr_load(ndkp1))); stop;
set_vect_of_cums2(ndk,vect_of_cums2(ndk) + cumvp1(1..num_mov * 4)); -- move the cums in
set_vect_of_cums2(ndkp1,cumvp1(num_mov * 4 + 1..)); -- move the cums out
set_vect_of_children(ndk,rk + rkp1(1..4 * num_mov)); -- now actually move the children
set_vect_of_children(ndkp1,rkp1(4 * num_mov + 1..));
else -- children will move right
pass("wo_share_move_right");
-- we must subtract the cumulant of the last remaining child
-- from that of each of the children moving right. The remaining cumulant of
-- the last child moving right must then be added to the cumulants of
-- all the original children of the (k + 1)-st node
rem_left_cum := get_ch_cum(ndk,numleft); -- cumulant of the last remaining child of ndk
for j in [numleft + 1..nchk] loop -- subtract this from the cum of all the nodes moving right
set_ch_cum(ndk,j,get_ch_cum(ndk,j) - rem_left_cum);
end loop;
total_moved_cum := get_ch_cum(ndk,nchk); -- get the cumulant of the last node moving right
for j in [1..nchkp1] loop -- add this to the cum of all the children of the (k + 1)-st node
set_ch_cum(ndkp1,j,total_moved_cum + get_ch_cum(ndkp1,j));
end loop;
set_ch_cum(rec,k,get_ch_cum(rec,k) - total_moved_cum);
-- the cumulant of the last node moving right must be subtracted from the cumulant of the k-th node
set_ch_cum2(rec,k,get_ch_cum2(ndk,numleft)); -- update the second cum of the moved node
-- and now we must move the corresponding cums
cumv := vect_of_cums(ndk); -- first vector of cums
cumvp1 := vect_of_cums(ndkp1); -- second vector of cums
set_vect_of_cums(ndkp1,cumv(5 * numleft + 1..5 * nchk) + vect_of_cums(ndkp1)); -- move the cums
-- since the number of cums of ndk is defined by its number of children, we need not edit that list