#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

#include "allvars.h"
#include "proto.h"
#include "domain.h"
#include "forcetree.h"
#include "mymalloc.h"
#include "endrun.h"

/*! \file forcetree.c
 *  \brief gravitational tree and code for Ewald correction
 *
 *  This file contains the computation of the gravitational force by means
 *  of a tree. The type of tree implemented is a geometrical oct-tree,
 *  starting from a cube encompassing all particles. This cube is
 *  automatically found in the domain decomposition, which also splits up
 *  the global "top-level" tree along node boundaries, moving the particles
 *  of different parts of the tree to separate processors. Tree nodes can
 *  be dynamically updated in drift/kick operations to avoid having to
 *  reconstruct the tree every timestep.
 */

struct NODE *Nodes_base,	/*!< points to the actual memory allocted for the nodes */
 *Nodes;			/*!< this is a pointer used to access the nodes which is shifted such that Nodes[All.MaxPart]
				   gives the first allocated node */


struct extNODE *Extnodes, *Extnodes_base;


int MaxNodes;			/*!< maximum allowed number of internal nodes */
int Numnodestree;		/*!< number of (internal) nodes in each tree */


int *Nextnode;			/*!< gives next node in tree walk  (nodes array) */
int *Father;			/*!< gives parent node in tree (Prenodes array) */

/*! auxiliary variable used to set-up non-recursive walk */
static int last;


static int tree_allocated_flag = 0;


static int
force_tree_build(int npart, struct unbind_data *mp);
static int
force_tree_build_single(int npart, struct unbind_data *mp);
static void
force_treeallocate(int maxnodes, int maxpart);

static void
force_update_hmax_of_node(int no, int mode);

static void
force_flag_localnodes(void);

static void
force_treeupdate_pseudos(int);

static void
force_update_node_recursive(int no, int sib, int father);

static void
force_create_empty_nodes(int no, int topnode, int bits, int x, int y, int z, int *nodecount, int *nextfree);

static void
force_exchange_pseudodata(void);

static void
force_insert_pseudo_particles(void);

int
force_tree_allocated()
{
    return tree_allocated_flag;
}

void
force_tree_rebuild()
{
    message(0, "Tree construction.  (presently allocated=%g MB)\n", AllocatedBytes / (1024.0 * 1024.0));

    if(force_tree_allocated()) {
        force_tree_free();
    }
    /* construct tree if needed */
    /* the tree is used in grav dens, hydro, bh and sfr */
    force_treeallocate((int) (All.TreeAllocFactor * All.MaxPart) + NTopnodes, All.MaxPart);

    walltime_measure("/Misc");

    force_tree_build(NumPart, NULL);

    walltime_measure("/Tree/Build");

    message(0, "Tree construction done.\n");

}

/*! This function is a driver routine for constructing the gravitational
 *  oct-tree, which is done by calling a small number of other functions.
 */
int force_tree_build(int npart, struct unbind_data *mp)
{
    int flag;

    do
    {
        Numnodestree = force_tree_build_single(npart, mp);

        MPI_Allreduce(&Numnodestree, &flag, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
        if(flag == -1)
        {
            force_tree_free();

            message(0, "Increasing TreeAllocFactor=%g", All.TreeAllocFactor);

            All.TreeAllocFactor *= 1.15;

            message(0, "new value=%g\n", All.TreeAllocFactor);

            force_treeallocate((int) (All.TreeAllocFactor * All.MaxPart) + NTopnodes, All.MaxPart);
        }
    }
    while(flag == -1);

    force_flag_localnodes();

    force_exchange_pseudodata();

    force_treeupdate_pseudos(All.MaxPart);

    TimeOfLastTreeConstruction = All.Time;

    return Numnodestree;
}



/*! Constructs the gravitational oct-tree.
 *
 *  The index convention for accessing tree nodes is the following: the
 *  indices 0...NumPart-1 reference single particles, the indices
 *  All.MaxPart.... All.MaxPart+nodes-1 reference tree nodes. `Nodes_base'
 *  points to the first tree node, while `nodes' is shifted such that
 *  nodes[All.MaxPart] gives the first tree node. Finally, node indices
 *  with values 'All.MaxPart + MaxNodes' and larger indicate "pseudo
 *  particles", i.e. multipole moments of top-level nodes that lie on
 *  different CPUs. If such a node needs to be opened, the corresponding
 *  particle must be exported to that CPU. The 'Extnodes' structure
 *  parallels that of 'Nodes'. Its information is only needed for the SPH
 *  part of the computation. (The data is split onto these two structures
 *  as a tuning measure.  If it is merged into 'Nodes' a somewhat bigger
 *  size of the nodes also for gravity would result, which would reduce
 *  cache utilization slightly.
 */
int force_tree_build_single(int npart, struct unbind_data *mp)
{
    int i, j, k, subnode = 0, shift, parent, numnodes, rep;
    int nfree, th, nn, no;
    struct NODE *nfreep;
    MyFloat lenhalf;
    peanokey key, morton, th_key, *morton_list;


    /* create an empty root node  */
    nfree = All.MaxPart;		/* index of first free node */
    nfreep = &Nodes[nfree];	/* select first node */

    nfreep->len = DomainLen;
    for(j = 0; j < 3; j++)
        nfreep->center[j] = DomainCenter[j];
    for(j = 0; j < 8; j++)
        nfreep->u.suns[j] = -1;


    numnodes = 1;
    nfreep++;
    nfree++;

    /* create a set of empty nodes corresponding to the top-level domain
     * grid. We need to generate these nodes first to make sure that we have a
     * complete top-level tree which allows the easy insertion of the
     * pseudo-particles at the right place
     */

    force_create_empty_nodes(All.MaxPart, 0, 1, 0, 0, 0, &numnodes, &nfree);

    /* if a high-resolution region in a global tree is used, we need to generate
     * an additional set empty nodes to make sure that we have a complete
     * top-level tree for the high-resolution inset
     */

    nfreep = &Nodes[nfree];
    parent = -1;			/* note: will not be used below before it is changed */

    morton_list = (peanokey *) mymalloc("morton_list", NumPart * sizeof(peanokey));

    /* now we insert all particles */
    for(k = 0; k < npart; k++)
    {
        if(mp)
            i = mp[k].index;
        else
            i = k;

        /*We sometimes want to disable the tree for hot particles*/
        if(P[i].Type == All.NoTreeType)
            continue;

        rep = 0;

        key = peano_and_morton_key((int) ((P[i].Pos[0] - DomainCorner[0]) * DomainFac),
                (int) ((P[i].Pos[1] - DomainCorner[1]) * DomainFac),
                (int) ((P[i].Pos[2] - DomainCorner[2]) * DomainFac), BITS_PER_DIMENSION,
                &morton);
        morton_list[i] = morton;

        shift = 3 * (BITS_PER_DIMENSION - 1);

        no = 0;
        while(TopNodes[no].Daughter >= 0)
        {
            no = TopNodes[no].Daughter + (key - TopNodes[no].StartKey) / (TopNodes[no].Size / 8);
            shift -= 3;
        }

        no = TopNodes[no].Leaf;
        th = DomainNodeIndex[no];

        while(1)
        {
            if(th >= All.MaxPart)	/* we are dealing with an internal node */
            {
                if(shift >= 0)
                {
                    subnode = ((morton >> shift) & 7);
                }
                else
                {
                    subnode = 0;
                    if(P[i].Pos[0] > Nodes[th].center[0])
                        subnode += 1;
                    if(P[i].Pos[1] > Nodes[th].center[1])
                        subnode += 2;
                    if(P[i].Pos[2] > Nodes[th].center[2])
                        subnode += 4;
                }

#ifndef NOTREERND
                if(Nodes[th].len < 1.0e-3 * All.ForceSoftening[P[i].Type])
                {
                    /* seems like we're dealing with particles at identical (or extremely close)
                     * locations. Randomize subnode index to allow tree construction. Note: Multipole moments
                     * of tree are still correct, but this will only happen well below gravitational softening
                     * length-scale anyway.
                     */
                    subnode = (int) (8.0 * get_random_number((P[i].ID + rep) % (RNDTABLE + (rep & 3))));

                    if(subnode >= 8)
                        subnode = 7;
                    rep++;
                }
#endif

                nn = Nodes[th].u.suns[subnode];

                shift -= 3;

                if(nn >= 0)	/* ok, something is in the daughter slot already, need to continue */
                {
                    parent = th;
                    th = nn;
                }
                else
                {
                    /* here we have found an empty slot where we can attach
                     * the new particle as a leaf.
                     */
                    Nodes[th].u.suns[subnode] = i;
                    break;	/* done for this particle */
                }
            }
            else
            {
                /* We try to insert into a leaf with a single particle.  Need
                 * to generate a new internal node at this point.
                 */
                Nodes[parent].u.suns[subnode] = nfree;

                nfreep->len = 0.5 * Nodes[parent].len;
                lenhalf = 0.25 * Nodes[parent].len;

                if(subnode & 1)
                    nfreep->center[0] = Nodes[parent].center[0] + lenhalf;
                else
                    nfreep->center[0] = Nodes[parent].center[0] - lenhalf;

                if(subnode & 2)
                    nfreep->center[1] = Nodes[parent].center[1] + lenhalf;
                else
                    nfreep->center[1] = Nodes[parent].center[1] - lenhalf;

                if(subnode & 4)
                    nfreep->center[2] = Nodes[parent].center[2] + lenhalf;
                else
                    nfreep->center[2] = Nodes[parent].center[2] - lenhalf;

                nfreep->u.suns[0] = -1;
                nfreep->u.suns[1] = -1;
                nfreep->u.suns[2] = -1;
                nfreep->u.suns[3] = -1;
                nfreep->u.suns[4] = -1;
                nfreep->u.suns[5] = -1;
                nfreep->u.suns[6] = -1;
                nfreep->u.suns[7] = -1;

                if(shift >= 0)
                {
                    th_key = morton_list[th];
                    subnode = ((th_key >> shift) & 7);
                }
                else
                {
                    subnode = 0;
                    if(P[th].Pos[0] > nfreep->center[0])
                        subnode += 1;
                    if(P[th].Pos[1] > nfreep->center[1])
                        subnode += 2;
                    if(P[th].Pos[2] > nfreep->center[2])
                        subnode += 4;
                }

#ifndef NOTREERND
                if(nfreep->len < 1.0e-3 * All.ForceSoftening[P[th].Type])
                {
                    /* seems like we're dealing with particles at identical (or extremely close)
                     * locations. Randomize subnode index to allow tree construction. Note: Multipole moments
                     * of tree are still correct, but this will only happen well below gravitational softening
                     * length-scale anyway.
                     */
                    subnode = (int) (8.0 * get_random_number((P[th].ID + rep) % (RNDTABLE + (rep & 3))));

                    if(subnode >= 8)
                        subnode = 7;
                    rep++;
                }
#endif
                nfreep->u.suns[subnode] = th;

                th = nfree;	/* resume trying to insert the new particle at
                             * the newly created internal node
                             */

                numnodes++;
                nfree++;
                nfreep++;

                if((numnodes) >= MaxNodes)
                {
                    message(1, "maximum number %d of tree-nodes reached for particle %d.\n",
                            MaxNodes, i);

                    if(All.TreeAllocFactor > 5.0)
                    {
                        message(1, "looks like a serious problem for particle %d, stopping with particle dump.\n", i);
                        savepositions(999999, 0);
                        endrun(1, "serious problem occured, snapshot saved.");
                    }
                    else
                    {
                        myfree(morton_list);
                        return -1;
                    }
                }
            }
        }
    }

    myfree(morton_list);


    /* insert the pseudo particles that represent the mass distribution of other domains */
    force_insert_pseudo_particles();


    /* now compute the multipole moments recursively */
    last = -1;

    force_update_node_recursive(All.MaxPart, -1, -1);

    if(last >= All.MaxPart)
    {
        if(last >= All.MaxPart + MaxNodes)	/* a pseudo-particle */
            Nextnode[last - MaxNodes] = -1;
        else
            Nodes[last].u.d.nextnode = -1;
    }
    else
        Nextnode[last] = -1;

    return numnodes;
}



/*! This function recursively creates a set of empty tree nodes which
 *  corresponds to the top-level tree for the domain grid. This is done to
 *  ensure that this top-level tree is always "complete" so that we can easily
 *  associate the pseudo-particles of other CPUs with tree-nodes at a given
 *  level in the tree, even when the particle population is so sparse that
 *  some of these nodes are actually empty.
 */
void force_create_empty_nodes(int no, int topnode, int bits, int x, int y, int z, int *nodecount,
        int *nextfree)
{
    int i, j, k, n, sub, count;
    MyFloat lenhalf;

    if(TopNodes[topnode].Daughter >= 0)
    {
        for(i = 0; i < 2; i++)
            for(j = 0; j < 2; j++)
                for(k = 0; k < 2; k++)
                {
                    sub = 7 & peano_hilbert_key((x << 1) + i, (y << 1) + j, (z << 1) + k, bits);

                    count = i + 2 * j + 4 * k;

                    Nodes[no].u.suns[count] = *nextfree;

                    lenhalf = 0.25 * Nodes[no].len;
                    Nodes[*nextfree].len = 0.5 * Nodes[no].len;
                    Nodes[*nextfree].center[0] = Nodes[no].center[0] + (2 * i - 1) * lenhalf;
                    Nodes[*nextfree].center[1] = Nodes[no].center[1] + (2 * j - 1) * lenhalf;
                    Nodes[*nextfree].center[2] = Nodes[no].center[2] + (2 * k - 1) * lenhalf;

                    for(n = 0; n < 8; n++)
                        Nodes[*nextfree].u.suns[n] = -1;

                    if(TopNodes[TopNodes[topnode].Daughter + sub].Daughter == -1)
                        DomainNodeIndex[TopNodes[TopNodes[topnode].Daughter + sub].Leaf] = *nextfree;

                    *nextfree = *nextfree + 1;
                    *nodecount = *nodecount + 1;

                    if((*nodecount) >= MaxNodes || (*nodecount) >= MaxTopNodes)
                    {
                        endrun(11, "maximum number MaxNodes=%d of tree-nodes reached."
                                "MaxTopNodes=%d NTopnodes=%d NTopleaves=%d nodecount=%d\n",
                                MaxNodes, MaxTopNodes, NTopnodes, NTopleaves, *nodecount);
                    }

                    force_create_empty_nodes(*nextfree - 1, TopNodes[topnode].Daughter + sub,
                            bits + 1, 2 * x + i, 2 * y + j, 2 * z + k, nodecount, nextfree);
                }
    }
}



/*! this function inserts pseudo-particles which will represent the mass
 *  distribution of the other CPUs. Initially, the mass of the
 *  pseudo-particles is set to zero, and their coordinate is set to the
 *  center of the domain-cell they correspond to. These quantities will be
 *  updated later on.
 */
void force_insert_pseudo_particles(void)
{
    int i, index;

    for(i = 0; i < NTopleaves; i++)
    {
        index = DomainNodeIndex[i];

        if(DomainTask[i] != ThisTask)
            Nodes[index].u.suns[0] = All.MaxPart + MaxNodes + i;
    }
}


/*! this routine determines the multipole moments for a given internal node
 *  and all its subnodes using a recursive computation.  The result is
 *  stored in the Nodes[] structure in the sequence of this tree-walk.
 *
 *  Note that the bitflags-variable for each node is used to store in the
 *  lowest bits some special information: Bit 0 flags whether the node
 *  belongs to the top-level tree corresponding to the domain
 *  decomposition, while Bit 1 signals whether the top-level node is
 *  dependent on local mass.
 *
 *  bits 2-4 give the particle type with
 *  the maximum softening among the particles in the node, and bit 5
 *  flags whether the node contains any particles with lower softening
 *  than that.
 */
void force_update_node_recursive(int no, int sib, int father)
{
    int j, jj, k, p, pp, nextsib, suns[8], count_particles, multiple_flag;
    MyFloat hmax, vmax, v, divVmax;
    MyFloat s[3], vs[3], mass;
    struct particle_data *pa;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
    int maxsofttype, current_maxsofttype, diffsoftflag;
#else
    MyFloat maxsoft;
#endif

    if(no >= All.MaxPart && no < All.MaxPart + MaxNodes)	/* internal node */
    {
        for(j = 0; j < 8; j++)
        suns[j] = Nodes[no].u.suns[j];	/* this "backup" is necessary because the nextnode entry will overwrite one element (union!) */
        if(last >= 0)
        {
            if(last >= All.MaxPart)
            {
                if(last >= All.MaxPart + MaxNodes)	/* a pseudo-particle */
                    Nextnode[last - MaxNodes] = no;
                else
                    Nodes[last].u.d.nextnode = no;
            }
            else
                Nextnode[last] = no;
        }

        last = no;

        mass = 0;
        s[0] = 0;
        s[1] = 0;
        s[2] = 0;
        vs[0] = 0;
        vs[1] = 0;
        vs[2] = 0;
        hmax = 0;
        vmax = 0;
        divVmax = 0;
        count_particles = 0;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
        maxsofttype = 7;
        diffsoftflag = 0;
#else
        maxsoft = 0;
#endif

        for(j = 0; j < 8; j++)
        {
            if((p = suns[j]) >= 0)
            {
                /* check if we have a sibling on the same level */
                for(jj = j + 1; jj < 8; jj++)
                    if((pp = suns[jj]) >= 0)
                        break;

                if(jj < 8)	/* yes, we do */
                    nextsib = pp;
                else
                    nextsib = sib;

                force_update_node_recursive(p, nextsib, no);

                if(p >= All.MaxPart)	/* an internal node or pseudo particle */
                {
                    if(p >= All.MaxPart + MaxNodes)	/* a pseudo particle */
                    {
                        /* nothing to be done here because the mass of the
                         * pseudo-particle is still zero. This will be changed
                         * later.
                         */
                    }
                    else
                    {
                        mass += (Nodes[p].u.d.mass);
                        s[0] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[0]);
                        s[1] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[1]);
                        s[2] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[2]);
                        vs[0] += (Nodes[p].u.d.mass * Extnodes[p].vs[0]);
                        vs[1] += (Nodes[p].u.d.mass * Extnodes[p].vs[1]);
                        vs[2] += (Nodes[p].u.d.mass * Extnodes[p].vs[2]);

                        if(Nodes[p].u.d.mass > 0)
                        {
                            if(Nodes[p].u.d.bitflags & (1 << BITFLAG_MULTIPLEPARTICLES))
                                count_particles += 2;
                            else
                                count_particles++;
                        }

                        if(Extnodes[p].hmax > hmax)
                            hmax = Extnodes[p].hmax;

                        if(Extnodes[p].vmax > vmax)
                            vmax = Extnodes[p].vmax;

                        if(Extnodes[p].divVmax > divVmax)
                            divVmax = Extnodes[p].divVmax;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
                        diffsoftflag |= maskout_different_softening_flag(Nodes[p].u.d.bitflags);

                        if(maxsofttype == 7)
                            maxsofttype = extract_max_softening_type(Nodes[p].u.d.bitflags);
                        else
                        {
                            current_maxsofttype = extract_max_softening_type(Nodes[p].u.d.bitflags);
                            if(current_maxsofttype != 7)
                            {
                                if(All.ForceSoftening[current_maxsofttype] > All.ForceSoftening[maxsofttype])
                                {
                                    maxsofttype = current_maxsofttype;
                                    diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                                }
                                else
                                {
                                    if(All.ForceSoftening[current_maxsofttype] <
                                            All.ForceSoftening[maxsofttype])
                                        diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                                }
                            }
                        }
#else
                        if(Nodes[p].maxsoft > maxsoft)
                            maxsoft = Nodes[p].maxsoft;
#endif
                    }
                }
                else		/* a particle */
                {
                    count_particles++;

                    pa = &P[p];

                    mass += (pa->Mass);
                    s[0] += (pa->Mass * pa->Pos[0]);
                    s[1] += (pa->Mass * pa->Pos[1]);
                    s[2] += (pa->Mass * pa->Pos[2]);
                    vs[0] += (pa->Mass * pa->Vel[0]);
                    vs[1] += (pa->Mass * pa->Vel[1]);
                    vs[2] += (pa->Mass * pa->Vel[2]);

                    if(pa->Type == 0)
                    {
                        if(P[p].Hsml > hmax)
                            hmax = P[p].Hsml;

                        if(SPHP(p).DivVel > divVmax)
                            divVmax = SPHP(p).DivVel;
                    }

                    for(k = 0; k < 3; k++)
                        if((v = fabs(pa->Vel[k])) > vmax)
                            vmax = v;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
                    if(maxsofttype == 7)
                        maxsofttype = pa->Type;
                    else
                    {
                        if(All.ForceSoftening[pa->Type] > All.ForceSoftening[maxsofttype])
                        {
                            maxsofttype = pa->Type;
                            diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                        }
                        else
                        {
                            if(All.ForceSoftening[pa->Type] < All.ForceSoftening[maxsofttype])
                                diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                        }
                    }
#else
                    if(pa->Type == 0)
                    {
                        if(P[p].Hsml > maxsoft)
                            maxsoft = P[p].Hsml;
                    }
                    else
                    {
                        if(All.ForceSoftening[pa->Type] > maxsoft)
                            maxsoft = All.ForceSoftening[pa->Type];
                    }
#endif
                }
            }
        }


        if(mass)
        {
            s[0] /= mass;
            s[1] /= mass;
            s[2] /= mass;
            vs[0] /= mass;
            vs[1] /= mass;
            vs[2] /= mass;
        }
        else
        {
            s[0] = Nodes[no].center[0];
            s[1] = Nodes[no].center[1];
            s[2] = Nodes[no].center[2];
            vs[0] = 0;
            vs[1] = 0;
            vs[2] = 0;
        }


        Nodes[no].Ti_current = All.Ti_Current;
        Nodes[no].u.d.mass = mass;
        Nodes[no].u.d.s[0] = s[0];
        Nodes[no].u.d.s[1] = s[1];
        Nodes[no].u.d.s[2] = s[2];

        Extnodes[no].Ti_lastkicked = All.Ti_Current;
        Extnodes[no].Flag = GlobFlag;
        Extnodes[no].vs[0] = vs[0];
        Extnodes[no].vs[1] = vs[1];
        Extnodes[no].vs[2] = vs[2];
        Extnodes[no].hmax = hmax;
        Extnodes[no].vmax = vmax;
        Extnodes[no].divVmax = divVmax;
        Extnodes[no].dp[0] = 0;
        Extnodes[no].dp[1] = 0;
        Extnodes[no].dp[2] = 0;

        if(count_particles > 1)	/* this flags that the node represents more than one particle */
            multiple_flag = (1 << BITFLAG_MULTIPLEPARTICLES);
        else
            multiple_flag = 0;

        Nodes[no].u.d.bitflags = multiple_flag;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
        Nodes[no].u.d.bitflags |= diffsoftflag + (maxsofttype << BITFLAG_MAX_SOFTENING_TYPE);
#else
        Nodes[no].maxsoft = maxsoft;
#endif
        Nodes[no].u.d.sibling = sib;
        Nodes[no].u.d.father = father;
    }
    else				/* single particle or pseudo particle */
    {
        if(last >= 0)
        {
            if(last >= All.MaxPart)
            {
                if(last >= All.MaxPart + MaxNodes)	/* a pseudo-particle */
                    Nextnode[last - MaxNodes] = no;
                else
                    Nodes[last].u.d.nextnode = no;
            }
            else
                Nextnode[last] = no;
        }

        last = no;

        if(no < All.MaxPart)	/* only set it for single particles */
            Father[no] = father;
    }
}




/*! This function communicates the values of the multipole moments of the
 *  top-level tree-nodes of the domain grid.  This data can then be used to
 *  update the pseudo-particles on each CPU accordingly.
 */
void force_exchange_pseudodata(void)
{
    int i, no, m, ta, recvTask;
    int *recvcounts, *recvoffset;
    struct DomainNODE
    {
        MyFloat s[3];
        MyFloat vs[3];
        MyFloat mass;
        MyFloat hmax;
        MyFloat vmax;
        MyFloat divVmax;
#ifdef ADAPTIVE_GRAVSOFT_FORGAS
        MyFloat maxsoft;
#endif

        unsigned int bitflags;
    }
    *DomainMoment;


    DomainMoment = (struct DomainNODE *) mymalloc("DomainMoment", NTopleaves * sizeof(struct DomainNODE));
    memset(&DomainMoment[0], 0, sizeof(DomainMoment[0]) * NTopleaves);

    for(m = 0; m < All.DomainOverDecompositionFactor; m++)
        for(i = DomainStartList[ThisTask * All.DomainOverDecompositionFactor + m];
                i <= DomainEndList[ThisTask * All.DomainOverDecompositionFactor + m]; i++)
        {
            no = DomainNodeIndex[i];

            /* read out the multipole moments from the local base cells */
            DomainMoment[i].s[0] = Nodes[no].u.d.s[0];
            DomainMoment[i].s[1] = Nodes[no].u.d.s[1];
            DomainMoment[i].s[2] = Nodes[no].u.d.s[2];
            DomainMoment[i].vs[0] = Extnodes[no].vs[0];
            DomainMoment[i].vs[1] = Extnodes[no].vs[1];
            DomainMoment[i].vs[2] = Extnodes[no].vs[2];
            DomainMoment[i].mass = Nodes[no].u.d.mass;
            DomainMoment[i].hmax = Extnodes[no].hmax;
            DomainMoment[i].vmax = Extnodes[no].vmax;
            DomainMoment[i].divVmax = Extnodes[no].divVmax;
            DomainMoment[i].bitflags = Nodes[no].u.d.bitflags;
#ifdef ADAPTIVE_GRAVSOFT_FORGAS
            DomainMoment[i].maxsoft = Nodes[no].maxsoft;
#endif
        }

    /* share the pseudo-particle data accross CPUs */

    recvcounts = (int *) mymalloc("recvcounts", sizeof(int) * NTask);
    recvoffset = (int *) mymalloc("recvoffset", sizeof(int) * NTask);

    for(m = 0; m < All.DomainOverDecompositionFactor; m++)
    {
        for(recvTask = 0; recvTask < NTask; recvTask++)
        {
            recvcounts[recvTask] =
                (DomainEndList[recvTask * All.DomainOverDecompositionFactor + m]
               - DomainStartList[recvTask * All.DomainOverDecompositionFactor + m] +
                 1)
             * sizeof(struct DomainNODE);
            recvoffset[recvTask] = DomainStartList[recvTask * All.DomainOverDecompositionFactor + m] * sizeof(struct DomainNODE);
        }

        MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                &DomainMoment[0], recvcounts, recvoffset,
                MPI_BYTE, MPI_COMM_WORLD);
    }

    myfree(recvoffset);
    myfree(recvcounts);


    for(ta = 0; ta < NTask; ta++)
        if(ta != ThisTask)
            for(m = 0; m < All.DomainOverDecompositionFactor; m++)
                for(i = DomainStartList[ta * All.DomainOverDecompositionFactor + m]; i <= DomainEndList[ta * All.DomainOverDecompositionFactor + m]; i++)
                {
                    no = DomainNodeIndex[i];

                    Nodes[no].u.d.s[0] = DomainMoment[i].s[0];
                    Nodes[no].u.d.s[1] = DomainMoment[i].s[1];
                    Nodes[no].u.d.s[2] = DomainMoment[i].s[2];
                    Extnodes[no].vs[0] = DomainMoment[i].vs[0];
                    Extnodes[no].vs[1] = DomainMoment[i].vs[1];
                    Extnodes[no].vs[2] = DomainMoment[i].vs[2];
                    Nodes[no].u.d.mass = DomainMoment[i].mass;
                    Extnodes[no].hmax = DomainMoment[i].hmax;
                    Extnodes[no].vmax = DomainMoment[i].vmax;
                    Extnodes[no].divVmax = DomainMoment[i].divVmax;
                    Nodes[no].u.d.bitflags =
                        (Nodes[no].u.d.bitflags & (~BITFLAG_MASK)) | (DomainMoment[i].bitflags & BITFLAG_MASK);
#ifdef ADAPTIVE_GRAVSOFT_FORGAS
                    Nodes[no].maxsoft = DomainMoment[i].maxsoft;
#endif
                }

    myfree(DomainMoment);
}


/*! This function updates the top-level tree after the multipole moments of
 *  the pseudo-particles have been updated.
 */
void force_treeupdate_pseudos(int no)
{
    int j, p, count_particles, multiple_flag;
    MyFloat hmax, vmax, divVmax;
    MyFloat s[3], vs[3], mass;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
    int maxsofttype, diffsoftflag, current_maxsofttype;
#else
    MyFloat maxsoft;
#endif

    mass = 0;
    s[0] = 0;
    s[1] = 0;
    s[2] = 0;
    vs[0] = 0;
    vs[1] = 0;
    vs[2] = 0;
    hmax = 0;
    vmax = 0;
    divVmax = 0;
    count_particles = 0;
#ifndef ADAPTIVE_GRAVSOFT_FORGAS
    maxsofttype = 7;
    diffsoftflag = 0;
#else
    maxsoft = 0;
#endif

    p = Nodes[no].u.d.nextnode;

    for(j = 0; j < 8; j++)	/* since we are dealing with top-level nodes, we now that there are 8 consecutive daughter nodes */
    {
        if(p >= All.MaxPart && p < All.MaxPart + MaxNodes)	/* internal node */
        {
            if(Nodes[p].u.d.bitflags & (1 << BITFLAG_INTERNAL_TOPLEVEL))
                force_treeupdate_pseudos(p);

            mass += (Nodes[p].u.d.mass);
            s[0] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[0]);
            s[1] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[1]);
            s[2] += (Nodes[p].u.d.mass * Nodes[p].u.d.s[2]);

            vs[0] += (Nodes[p].u.d.mass * Extnodes[p].vs[0]);
            vs[1] += (Nodes[p].u.d.mass * Extnodes[p].vs[1]);
            vs[2] += (Nodes[p].u.d.mass * Extnodes[p].vs[2]);

            if(Extnodes[p].hmax > hmax)
                hmax = Extnodes[p].hmax;
            if(Extnodes[p].vmax > vmax)
                vmax = Extnodes[p].vmax;
            if(Extnodes[p].divVmax > divVmax)
                divVmax = Extnodes[p].divVmax;

            if(Nodes[p].u.d.mass > 0)
            {
                if(Nodes[p].u.d.bitflags & (1 << BITFLAG_MULTIPLEPARTICLES))
                    count_particles += 2;
                else
                    count_particles++;
            }

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
            diffsoftflag |= maskout_different_softening_flag(Nodes[p].u.d.bitflags);

            if(maxsofttype == 7)
                maxsofttype = extract_max_softening_type(Nodes[p].u.d.bitflags);
            else
            {
                current_maxsofttype = extract_max_softening_type(Nodes[p].u.d.bitflags);
                if(current_maxsofttype != 7)
                {
                    if(All.ForceSoftening[current_maxsofttype] > All.ForceSoftening[maxsofttype])
                    {
                        maxsofttype = current_maxsofttype;
                        diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                    }
                    else
                    {
                        if(All.ForceSoftening[current_maxsofttype] < All.ForceSoftening[maxsofttype])
                            diffsoftflag = (1 << BITFLAG_MIXED_SOFTENINGS_IN_NODE);
                    }
                }
            }
#else
            if(Nodes[p].maxsoft > maxsoft)
                maxsoft = Nodes[p].maxsoft;
#endif
        }
        else
            endrun(6767, "may not happen");		/* may not happen */

        p = Nodes[p].u.d.sibling;
    }

    if(mass)
    {
        s[0] /= mass;
        s[1] /= mass;
        s[2] /= mass;
        vs[0] /= mass;
        vs[1] /= mass;
        vs[2] /= mass;
    }
    else
    {
        s[0] = Nodes[no].center[0];
        s[1] = Nodes[no].center[1];
        s[2] = Nodes[no].center[2];
        vs[0] = 0;
        vs[1] = 0;
        vs[2] = 0;
    }


    Nodes[no].u.d.s[0] = s[0];
    Nodes[no].u.d.s[1] = s[1];
    Nodes[no].u.d.s[2] = s[2];
    Extnodes[no].vs[0] = vs[0];
    Extnodes[no].vs[1] = vs[1];
    Extnodes[no].vs[2] = vs[2];
    Nodes[no].u.d.mass = mass;

    Extnodes[no].hmax = hmax;
    Extnodes[no].vmax = vmax;
    Extnodes[no].divVmax = divVmax;

    Extnodes[no].Flag = GlobFlag;

    if(count_particles > 1)
        multiple_flag = (1 << BITFLAG_MULTIPLEPARTICLES);
    else
        multiple_flag = 0;

    Nodes[no].u.d.bitflags &= (~BITFLAG_MASK);	/* this clears the bits */

    Nodes[no].u.d.bitflags |= multiple_flag;

#ifndef ADAPTIVE_GRAVSOFT_FORGAS
    Nodes[no].u.d.bitflags |= diffsoftflag + (maxsofttype << BITFLAG_MAX_SOFTENING_TYPE);
#else
    Nodes[no].maxsoft = maxsoft;
#endif
}



/*! This function flags nodes in the top-level tree that are dependent on
 *  local particle data.
 */
static void
force_flag_localnodes(void)
{
    int no, i, m;

    /* mark all top-level nodes */

    for(i = 0; i < NTopleaves; i++)
    {
        no = DomainNodeIndex[i];

        while(no >= 0)
        {
            if(Nodes[no].u.d.bitflags & (1 << BITFLAG_TOPLEVEL))
                break;

            Nodes[no].u.d.bitflags |= (1 << BITFLAG_TOPLEVEL);

            no = Nodes[no].u.d.father;
        }

        /* mark also internal top level nodes */

        no = DomainNodeIndex[i];
        no = Nodes[no].u.d.father;

        while(no >= 0)
        {
            if(Nodes[no].u.d.bitflags & (1 << BITFLAG_INTERNAL_TOPLEVEL))
                break;

            Nodes[no].u.d.bitflags |= (1 << BITFLAG_INTERNAL_TOPLEVEL);

            no = Nodes[no].u.d.father;
        }
    }

    /* mark top-level nodes that contain local particles */

    for(m = 0; m < All.DomainOverDecompositionFactor; m++)
        for(i = DomainStartList[ThisTask * All.DomainOverDecompositionFactor + m];
                i <= DomainEndList[ThisTask * All.DomainOverDecompositionFactor + m]; i++)
        {
            no = DomainNodeIndex[i];

            if(DomainTask[i] != ThisTask)
                endrun(131231231, "DomainTask struct is corrupted");

            while(no >= 0)
            {
                if(Nodes[no].u.d.bitflags & (1 << BITFLAG_DEPENDS_ON_LOCAL_MASS))
                    break;

                Nodes[no].u.d.bitflags |= (1 << BITFLAG_DEPENDS_ON_LOCAL_MASS);

                no = Nodes[no].u.d.father;
            }
        }
}

static void real_force_drift_node(int no, int time1);
void force_drift_node(int no, int time1) {
    force_drift_node_full(no, time1, 1);
}

int force_drift_node_full(int no, int time1, int blocking) {
    if(time1 == Nodes[no].Ti_current)
        return 0;
#pragma omp atomic
        TotalNodeDrifts ++;

#ifdef OPENMP_USE_SPINLOCK
    int lockstate;
    if (blocking) {
        lockstate = pthread_spin_lock(&Nodes[no].SpinLock);
    } else {
        lockstate = pthread_spin_trylock(&Nodes[no].SpinLock);
    }
    if(0 == lockstate) {
        if(time1 != Nodes[no].Ti_current) {
            real_force_drift_node(no, time1);
#pragma omp flush
        } else {
#pragma omp atomic
            BlockedNodeDrifts ++;
        }
        pthread_spin_unlock(&Nodes[no].SpinLock);
        return 0;
    } else {
        if(blocking) {
            endrun(99999, "shall not happend");
        }
        return -1;
    }
#else
    /* do not use spinlock */
#pragma omp critical (_driftnode_)
    {
        if(time1 != Nodes[no].Ti_current) {
            real_force_drift_node(no, time1);
        }  else {
            BlockedNodeDrifts ++;
        }
    }
    return 0;
#endif
}

static void real_force_drift_node(int no, int time1)
{
    int j;
    double dt_drift, dt_drift_hmax, fac;
    if(time1 == Nodes[no].Ti_current) return;

    if(Nodes[no].u.d.bitflags & (1 << BITFLAG_NODEHASBEENKICKED))
    {
        if(Extnodes[no].Ti_lastkicked != Nodes[no].Ti_current)
        {
            endrun(1, "inconsistency in drift node: Extnodes[no].Ti_lastkicked=%d  Nodes[no].Ti_current=%d\n",
                       Extnodes[no].Ti_lastkicked, Nodes[no].Ti_current);
        }

        if(Nodes[no].u.d.mass)
            fac = 1 / Nodes[no].u.d.mass;
        else
            fac = 0;

        for(j = 0; j < 3; j++)
        {
            Extnodes[no].vs[j] += fac * (Extnodes[no].dp[j]);
            Extnodes[no].dp[j] = 0;
        }
        Nodes[no].u.d.bitflags &= (~(1 << BITFLAG_NODEHASBEENKICKED));
    }

    dt_drift_hmax = get_drift_factor(Nodes[no].Ti_current, time1);
    dt_drift = dt_drift_hmax;

    for(j = 0; j < 3; j++)
        Nodes[no].u.d.s[j] += Extnodes[no].vs[j] * dt_drift;
    Nodes[no].len += 2 * Extnodes[no].vmax * dt_drift;

    //  Extnodes[no].hmax *= exp(0.333333333333 * Extnodes[no].divVmax * dt_drift_hmax);

    Nodes[no].Ti_current = time1;
}


void force_kick_node(int i, MyFloat * dv)
{
    int j, no;
    MyFloat dp[3], v, vmax;

    /*We sometimes want to disable the tree for hot particles*/
    if(P[i].Type == All.NoTreeType)
        return;

    for(j = 0; j < 3; j++)
    {
        dp[j] = P[i].Mass * dv[j];
    }

    for(j = 0, vmax = 0; j < 3; j++)
        if((v = fabs(P[i].Vel[j])) > vmax)
            vmax = v;

    no = Father[i];

    while(no >= 0)
    {
        real_force_drift_node(no, All.Ti_Current);

        for(j = 0; j < 3; j++)
        {
            Extnodes[no].dp[j] += dp[j];
        }

        if(Extnodes[no].vmax < vmax)
            Extnodes[no].vmax = vmax;

        Nodes[no].u.d.bitflags |= (1 << BITFLAG_NODEHASBEENKICKED);

        Extnodes[no].Ti_lastkicked = All.Ti_Current;

        if(Nodes[no].u.d.bitflags & (1 << BITFLAG_TOPLEVEL))	/* top-level tree-node reached */
        {
            if(Extnodes[no].Flag != GlobFlag)
            {
                Extnodes[no].Flag = GlobFlag;
                DomainList[DomainNumChanged++] = no;
            }
            break;
        }

        no = Nodes[no].u.d.father;
    }
}

void force_finish_kick_nodes(void)
{
    int i, j, no, ta, totDomainNumChanged;
    int *domainList_all;
    int *counts, *counts_dp, *offset_list, *offset_dp, *offset_vmax;
    MyDouble *domainDp_loc, *domainDp_all;

    MyFloat *domainVmax_loc, *domainVmax_all;

    /* share the momentum-data of the pseudo-particles accross CPUs */

    counts = (int *) mymalloc("counts", sizeof(int) * NTask);
    counts_dp = (int *) mymalloc("counts_dp", sizeof(int) * NTask);
    offset_list = (int *) mymalloc("offset_list", sizeof(int) * NTask);
    offset_dp = (int *) mymalloc("offset_dp", sizeof(int) * NTask);
    offset_vmax = (int *) mymalloc("offset_vmax", sizeof(int) * NTask);

    domainDp_loc = (MyDouble *) mymalloc("domainDp_loc", DomainNumChanged * 3 * sizeof(MyDouble));
    domainVmax_loc = (MyFloat *) mymalloc("domainVmax_loc", DomainNumChanged * sizeof(MyFloat));

    for(i = 0; i < DomainNumChanged; i++)
    {
        for(j = 0; j < 3; j++)
        {
            domainDp_loc[i * 3 + j] = Extnodes[DomainList[i]].dp[j];
        }
        domainVmax_loc[i] = Extnodes[DomainList[i]].vmax;
    }

    MPI_Allgather(&DomainNumChanged, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

    for(ta = 0, totDomainNumChanged = 0, offset_list[0] = 0, offset_dp[0] = 0, offset_vmax[0] = 0; ta < NTask;
            ta++)
    {
        totDomainNumChanged += counts[ta];
        if(ta > 0)
        {
            offset_list[ta] = offset_list[ta - 1] + counts[ta - 1];
            offset_dp[ta] = offset_dp[ta - 1] + counts[ta - 1] * 3 * sizeof(MyDouble);
            offset_vmax[ta] = offset_vmax[ta - 1] + counts[ta - 1] * sizeof(MyFloat);
        }
    }

    message(0, "I exchange kick momenta for %d top-level nodes out of %d\n", totDomainNumChanged, NTopleaves);

    domainDp_all = (MyDouble *) mymalloc("domainDp_all", totDomainNumChanged * 3 * sizeof(MyDouble));
    domainVmax_all = (MyFloat *) mymalloc("domainVmax_all", totDomainNumChanged * sizeof(MyFloat));

    domainList_all = (int *) mymalloc("domainList_all", totDomainNumChanged * sizeof(int));

    MPI_Allgatherv(DomainList, DomainNumChanged, MPI_INT,
            domainList_all, counts, offset_list, MPI_INT, MPI_COMM_WORLD);

    for(ta = 0; ta < NTask; ta++)
    {
        counts_dp[ta] = counts[ta] * 3 * sizeof(MyDouble);
        counts[ta] *= sizeof(MyFloat);
    }


    MPI_Allgatherv(domainDp_loc, DomainNumChanged * 3 * sizeof(MyDouble), MPI_BYTE,
            domainDp_all, counts_dp, offset_dp, MPI_BYTE, MPI_COMM_WORLD);

    MPI_Allgatherv(domainVmax_loc, DomainNumChanged * sizeof(MyFloat), MPI_BYTE,
            domainVmax_all, counts, offset_vmax, MPI_BYTE, MPI_COMM_WORLD);


    /* construct momentum kicks in top-level tree */
    for(i = 0; i < totDomainNumChanged; i++)
    {
        no = domainList_all[i];

        if(Nodes[no].u.d.bitflags & (1 << BITFLAG_DEPENDS_ON_LOCAL_MASS))	/* to avoid that the local one is kicked twice */
            no = Nodes[no].u.d.father;

        while(no >= 0)
        {
            real_force_drift_node(no, All.Ti_Current);

            for(j = 0; j < 3; j++)
            {
                Extnodes[no].dp[j] += domainDp_all[3 * i + j];
            }

            if(Extnodes[no].vmax < domainVmax_all[i])
                Extnodes[no].vmax = domainVmax_all[i];

            Nodes[no].u.d.bitflags |= (1 << BITFLAG_NODEHASBEENKICKED);
            Extnodes[no].Ti_lastkicked = All.Ti_Current;

            no = Nodes[no].u.d.father;
        }
    }

    myfree(domainList_all);
    myfree(domainVmax_all);
    myfree(domainDp_all);
    myfree(domainVmax_loc);
    myfree(domainDp_loc);
    myfree(offset_vmax);
    myfree(offset_dp);
    myfree(offset_list);
    myfree(counts_dp);
    myfree(counts);
}


/*! This function updates the hmax-values in tree nodes that hold SPH
 *  particles. These values are needed to find all neighbors in the
 *  hydro-force computation.  Since the Hsml-values are potentially changed
 *  in the SPH-denity computation, force_update_hmax() should be carried
 *  out just before the hydrodynamical SPH forces are computed, i.e. after
 *  density().
 */
void force_update_hmax(void)
{
    int i, no, ta, totDomainNumChanged;
    int *domainList_all;
    int *counts, *offset_list, *offset_hmax;
    MyFloat *domainHmax_loc, *domainHmax_all;

    walltime_measure("/Misc");
    GlobFlag++;

    DomainNumChanged = 0;
    DomainList = (int *) mymalloc("DomainList", NTopleaves * sizeof(int));

    for(i = FirstActiveParticle; i >= 0; i = NextActiveParticle[i])
        if(P[i].Type == 0)
        {
            no = Father[i];

            while(no >= 0)
            {
                real_force_drift_node(no, All.Ti_Current);

                if(P[i].Hsml > Extnodes[no].hmax || SPHP(i).DivVel > Extnodes[no].divVmax)
                {
                    if(P[i].Hsml > Extnodes[no].hmax)
                        Extnodes[no].hmax = P[i].Hsml;

                    if(SPHP(i).DivVel > Extnodes[no].divVmax)
                        Extnodes[no].divVmax = SPHP(i).DivVel;

                    if(Nodes[no].u.d.bitflags & (1 << BITFLAG_TOPLEVEL))	/* we reached a top-level node */
                    {
                        if(Extnodes[no].Flag != GlobFlag)
                        {
                            Extnodes[no].Flag = GlobFlag;
                            DomainList[DomainNumChanged++] = no;
                        }
                        break;
                    }
                }
                else
                    break;

                no = Nodes[no].u.d.father;
            }
        }

    /* share the hmax-data of the pseudo-particles accross CPUs */

    counts = (int *) mymalloc("counts", sizeof(int) * NTask);
    offset_list = (int *) mymalloc("offset_list", sizeof(int) * NTask);
    offset_hmax = (int *) mymalloc("offset_hmax", sizeof(int) * NTask);

    domainHmax_loc = (MyFloat *) mymalloc("domainHmax_loc", DomainNumChanged * 2 * sizeof(MyFloat));

    for(i = 0; i < DomainNumChanged; i++)
    {
        domainHmax_loc[2 * i] = Extnodes[DomainList[i]].hmax;
        domainHmax_loc[2 * i + 1] = Extnodes[DomainList[i]].divVmax;
    }


    MPI_Allgather(&DomainNumChanged, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

    for(ta = 0, totDomainNumChanged = 0, offset_list[0] = 0, offset_hmax[0] = 0; ta < NTask; ta++)
    {
        totDomainNumChanged += counts[ta];
        if(ta > 0)
        {
            offset_list[ta] = offset_list[ta - 1] + counts[ta - 1];
            offset_hmax[ta] = offset_hmax[ta - 1] + counts[ta - 1] * 2 * sizeof(MyFloat);
        }
    }

    message(0, "Hmax exchange: %d topleaves out of %d\n", totDomainNumChanged, NTopleaves);

    domainHmax_all = (MyFloat *) mymalloc("domainHmax_all", totDomainNumChanged * 2 * sizeof(MyFloat));
    domainList_all = (int *) mymalloc("domainList_all", totDomainNumChanged * sizeof(int));

    MPI_Allgatherv(DomainList, DomainNumChanged, MPI_INT,
            domainList_all, counts, offset_list, MPI_INT, MPI_COMM_WORLD);

    for(ta = 0; ta < NTask; ta++)
        counts[ta] *= 2 * sizeof(MyFloat);

    MPI_Allgatherv(domainHmax_loc, 2 * DomainNumChanged * sizeof(MyFloat), MPI_BYTE,
            domainHmax_all, counts, offset_hmax, MPI_BYTE, MPI_COMM_WORLD);


    for(i = 0; i < totDomainNumChanged; i++)
    {
        no = domainList_all[i];

        if(Nodes[no].u.d.bitflags & (1 << BITFLAG_DEPENDS_ON_LOCAL_MASS))	/* to avoid that the hmax is updated twice */
            no = Nodes[no].u.d.father;

        while(no >= 0)
        {
            real_force_drift_node(no, All.Ti_Current);

            if(domainHmax_all[2 * i] > Extnodes[no].hmax || domainHmax_all[2 * i + 1] > Extnodes[no].divVmax)
            {
                if(domainHmax_all[2 * i] > Extnodes[no].hmax)
                    Extnodes[no].hmax = domainHmax_all[2 * i];

                if(domainHmax_all[2 * i + 1] > Extnodes[no].divVmax)
                    Extnodes[no].divVmax = domainHmax_all[2 * i + 1];
            }
            else
                break;

            no = Nodes[no].u.d.father;
        }
    }


    myfree(domainList_all);
    myfree(domainHmax_all);
    myfree(domainHmax_loc);
    myfree(offset_hmax);
    myfree(offset_list);
    myfree(counts);
    myfree(DomainList);

    walltime_measure("/Tree/HmaxUpdate");
}

/*! This function allocates the memory used for storage of the tree and of
 *  auxiliary arrays needed for tree-walk and link-lists.  Usually,
 *  maxnodes approximately equal to 0.7*maxpart is sufficient to store the
 *  tree for up to maxpart particles.
 */
void force_treeallocate(int maxnodes, int maxpart)
{
    size_t bytes;
    double allbytes = 0, allbytes_topleaves = 0;

    tree_allocated_flag = 1;
    DomainNodeIndex = (int *) mymalloc("DomainNodeIndex", bytes = NTopleaves * sizeof(int));
    allbytes_topleaves += bytes;
    MaxNodes = maxnodes;
    if(!(Nodes_base = (struct NODE *) mymalloc("Nodes_base", bytes = (MaxNodes + 1) * sizeof(struct NODE))))
    {
        endrun(3, "failed to allocate memory for %d tree-nodes (%g MB).\n", MaxNodes, bytes / (1024.0 * 1024.0));
    }
    allbytes += bytes;
    if(!
            (Extnodes_base =
             (struct extNODE *) mymalloc("Extnodes_base", bytes = (MaxNodes + 1) * sizeof(struct extNODE))))
    {
        endrun(3, "failed to allocate memory for %d tree-extnodes (%g MB).\n",
                MaxNodes, bytes / (1024.0 * 1024.0));
    }
#ifdef OPENMP_USE_SPINLOCK
    {
        int i;
        for (i = 0; i < MaxNodes + 1; i ++) {
            /* Maybe we can directly set these guys to one
             *
             * at least with the glibc spinlock implementation.
             * */
            pthread_spin_init(&Nodes_base[i].SpinLock, 0);
        }
    }
#endif
    allbytes += bytes;
    Nodes = Nodes_base - All.MaxPart;
    Extnodes = Extnodes_base - All.MaxPart;
    if(!(Nextnode = (int *) mymalloc("Nextnode", bytes = (maxpart + NTopnodes) * sizeof(int))))
    {
        endrun(1, "Failed to allocate %d spaces for 'Nextnode' array (%g MB)\n",
                maxpart + NTopnodes, bytes / (1024.0 * 1024.0));
    }
    allbytes += bytes;
    if(!(Father = (int *) mymalloc("Father", bytes = (maxpart) * sizeof(int))))
    {
        endrun(1, "Failed to allocate %d spaces for 'Father' array (%g MB)\n", maxpart, bytes / (1024.0 * 1024.0));
    }
    allbytes += bytes;

    message(0, "Allocated %g MByte for BH-tree, and %g Mbyte for top-leaves.  (presently allocted %g MB)\n",
             allbytes / (1024.0 * 1024.0), allbytes_topleaves / (1024.0 * 1024.0),
             AllocatedBytes / (1024.0 * 1024.0));
}


/*! This function frees the memory allocated for the tree, i.e. it frees
 *  the space allocated by the function force_treeallocate().
 */
void force_tree_free(void)
{
    myfree(Father);
    myfree(Nextnode);
    myfree(Extnodes_base);
    myfree(Nodes_base);
    myfree(DomainNodeIndex);
    tree_allocated_flag = 0;
}