rygorous · July 26, 2021 01:51
diff --git a/gistfile1.txt b/gistfile1.txt
 // Merge pass.
 //
 // Combines two input runs of elemsPerRun elements each (inA and inB) into
 // a single run of 2*elemsPerRun elements written to out. Data moves
 // through a 32-element working set held in four vectors: after each
 // bitonic_merge16 the lower half (lo0,lo1) is written out and refilled
 // with the next 16 input elements, while the upper half (hi0,hi1) stays
 // in the merger.
 //
 // Precondition (from the original author): elemsPerRun must be a power
 // of two and at least 16.
 static void merge_pass(S16 *out, const S16 *inA, const S16 *inB, size_t elemsPerRun)
 {
    const S16 *const stopA = inA + elemsPerRun;
    const S16 *const stopB = inB + elemsPerRun;

    // Prime the merger with the first 16 elements of each run.
    Vec lo0 = load8_s16(inA);
    Vec lo1 = load8_s16(inA + 8);
    Vec hi0 = load8_s16(inB);
    Vec hi1 = load8_s16(inB + 8);
    inA += 16;
    inB += 16;

    // Phase 1: both runs still have unread elements.
    while (inA < stopA && inB < stopB)
    {
        bitonic_merge16(lo0,lo1, hi0,hi1);

        // Emit the lower 16 elements of the working set.
        store8_s16(out, lo0);
        store8_s16(out + 8, lo1);
        out += 16;

        // Refill from the run whose next unread element is smaller (ties
        // go to A). Expressed purely with selects so that no branch is
        // needed on the data-dependent comparison.
        const bool takeA = *inA <= *inB;
        const S16 *src = takeA ? inA : inB;
        inA += takeA ? 16 : 0;
        inB += takeA ? 0 : 16;

        lo0 = load8_s16(src);
        lo1 = load8_s16(src + 8);
    }

    // Phase 2: exactly one run is exhausted. Pick the other one and
    // stream the rest of it through the merger.
    const bool restIsA = inA < stopA;
    const S16 *rest = restIsA ? inA : inB;
    const S16 *const restEnd = restIsA ? stopA : stopB;

    for (; rest < restEnd; rest += 16)
    {
        bitonic_merge16(lo0,lo1, hi0,hi1);

        store8_s16(out, lo0);
        store8_s16(out + 8, lo1);
        out += 16;

        lo0 = load8_s16(rest);
        lo1 = load8_s16(rest + 8);
    }

    // Phase 3: nothing left to load; merge once more and flush all 32
    // elements still held in the working set.
    bitonic_merge16(lo0,lo1, hi0,hi1);

    store8_s16(out, lo0);
    store8_s16(out + 8, lo1);
    store8_s16(out + 16, hi0);
    store8_s16(out + 24, hi1);
 }
	// Merge pass.
	//
	// Merges the two runs starting at inA and inB (elemsPerRun elements
	// each) and writes the combined 2*elemsPerRun elements to out.
	//
	// out          destination; receives 2*elemsPerRun elements
	// inA, inB     the two input runs (read-only)
	// elemsPerRun  length of each input run; the original author requires
	//              a power of two, at least 16
	//
	// NOTE(review): out, inA and inB must be pointers. The pointer
	// declarators and the dereferences in the head comparison had been
	// stripped from this copy (looks like markup eating the '*'
	// characters); they are restored here.
	static void merge_pass(S16 *out, const S16 *inA, const S16 *inB, size_t elemsPerRun)
	{
		// need pow2 elemsPerRun>=16!
		const S16 *endA = inA + elemsPerRun;
		const S16 *endB = inB + elemsPerRun;

		// Seed the merger with the first 16 elements of each run.
		Vec vMin0 = load8_s16(inA + 0);
		Vec vMin1 = load8_s16(inA + 8);
		Vec vMax0 = load8_s16(inB + 0);
		Vec vMax1 = load8_s16(inB + 8);
		inA += 16;
		inB += 16;

		while (inA < endA && inB < endB)
		{
			// Merge two partial 16-element runs
			bitonic_merge16(vMin0,vMin1, vMax0,vMax1);

			// Store the smaller 16 elements (two vectors of 8)
			store8_s16(out + 0, vMin0);
			store8_s16(out + 8, vMin1);
			out += 16;

			// Insert next batch of elements from whichever vector has the
			// next-smallest value we haven't inserted into the merger yet.
			// The comparison must look at the head *elements*, not at the
			// cursor addresses.
			// Note: written to avoid branches!
			bool next_is_A = *inA <= *inB;
			const S16 *incA = inA + 16;
			const S16 *incB = inB + 16;
			const S16 *loadPtr = next_is_A ? inA : inB;
			inA = next_is_A ? incA : inA;
			inB = next_is_A ? inB : incB;

			vMin0 = load8_s16(loadPtr + 0);
			vMin1 = load8_s16(loadPtr + 8);
		}

		// One of the inputs hit the end; enter tail merging phase.
		// Just swap things around so the remaining list is list A.
		bool leftover_is_A = (inA < endA);
		endA = leftover_is_A ? endA : endB;
		inA = leftover_is_A ? inA : inB;

		while (inA < endA)
		{
			// Merge two partial 16-element runs
			bitonic_merge16(vMin0,vMin1, vMax0,vMax1);

			// Store the smaller 16 elements (two vectors of 8)
			store8_s16(out + 0, vMin0);
			store8_s16(out + 8, vMin1);
			out += 16;

			// Load next batch
			vMin0 = load8_s16(inA + 0);
			vMin1 = load8_s16(inA + 8);
			inA += 16;
		}

		// Final batch: no more input, so merge once more and flush the
		// whole 32-element working set.
		bitonic_merge16(vMin0,vMin1, vMax0,vMax1);

		// Store the results
		store8_s16(out + 0, vMin0);
		store8_s16(out + 8, vMin1);
		store8_s16(out + 16, vMax0);
		store8_s16(out + 24, vMax1);
	}
No results found