October 3, 2023 08:56 · December 5, 2018 04:43 · November 20, 2018 16:05 · October 2, 2018 16:49 · July 5, 2018 20:01 · September 21, 2018 14:45
 a1.medium $ ./build/release/meshoptimizer buddha.obj
 # buddha.obj: 549409 vertices, 1087474 triangles; read in 483.46 msec; indexed in 354.61 msec
 Original : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) Overfetch 2.105950 Overdraw 1.200370 in 0.00 msec
 Random   : ACMR 2.999919 ATVR 5.937897 (NV 5.937882 AMD 5.937935 Intel 5.936783) Overfetch 10.839888 Overdraw 1.218682 in 33.38 msec
 Cache    : ACMR 0.661465 ATVR 1.309272 (NV 1.590738 AMD 1.434356 Intel 1.138871) Overfetch 1.509062 Overdraw 1.206893 in 477.86 msec
 CacheFifo: ACMR 0.689948 ATVR 1.365651 (NV 1.706663 AMD 1.516610 Intel 1.229416) Overfetch 1.552013 Overdraw 1.197034 in 146.00 msec
 Overdraw : ACMR 2.776432 ATVR 5.495538 (NV 5.508446 AMD 5.527603 Intel 5.314811) Overfetch 8.624212 Overdraw 1.086317 in 209.29 msec
 Fetch    : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) Overfetch 2.105950 Overdraw 1.200370 in 25.13 msec
 FetchMap : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) O
 ~/pugixml$ cat ucd.cpp
 #include "pugixml.hpp"
 #include <malloc.h>

 int main()
 {
        pugi::xml_document doc;
        doc.load_file("ucd.all.grouped.xml");
        malloc_stats();
 }
 class RemapInterfaceIdsPass : public spvtools::opt::Pass
 {
 public:
    const char* name() const override { return "remap-interface-ids"; }

    RemapInterfaceIdsPass(uint32_t start_id): start_id(start_id)
    {
    }

    Status Process() override
 #pragma once

 #include <string>

 namespace fmt {

 #define KIND(X) \
 	X(bool,bool) \
 	X(signed char,schar) \
 	X(unsigned char,uchar) \
 namespace tstd
 {
 	template <typename T>
 	class vector
 	{
 	public:
 		typedef T* iterator;
 		typedef const T* const_iterator;

 		iterator begin() { return begin_; }
 #include <stdio.h>
 #include <stdint.h>

 // Returns the next value of a deterministic sequence kept in a function-local
 // static: every call replaces the state with state * 17 + 1 (wrapping at
 // 32 bits) and returns the new state. The initial state is 0xdeadbeef.
 // The function is marked noinline via __declspec.
 __declspec(noinline)
 uint32_t giveMeRand()
 {
 	static uint32_t state = 0xdeadbeef;
 	state = state * 17u + 1u;
 	return state;
 }
 diff --git a/src/indexcodec.cpp b/src/indexcodec.cpp
 index 4ab92cb..18ccd43 100644
 --- a/src/indexcodec.cpp
 +++ b/src/indexcodec.cpp
 @@ -7,9 +7,6 @@
 // This work is based on:
 // Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
 // Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
 -namespace meshopt
 -{
 Using fixed-point encoding for position (14 bits per component) and texture coordinates (12 bits per component), with a 32-bit index buffer
 and this vertex format:

 // Packed vertex, 12 bytes:
 //   px, py, pz : 3 x unsigned short
 //   nu, nv     : 2 x unsigned char
 //   tx, ty     : 2 x unsigned short
 struct PackedVertexOct
 {
 	unsigned short px;
 	unsigned short py;
 	unsigned short pz;

 	// octahedron encoded normal, aliases .pw
 	unsigned char nu;
 	unsigned char nv;

 	unsigned short tx;
 	unsigned short ty;
 };
 Algorithms used for Cone* preprocess the mesh in some way, then split sequentially into 64-triangle clusters:
 ConeBase: optimize mesh for transform cache
 ConeSort: split mesh into large planar connected clusters, bin clusters into 6 buckets by cardinal axes, optimize each bucket for transform cache
 ConeAcmr: optimize mesh for transform cache, split sequentially into variable length clusters that are relatively planar, sort clusters by avg normal
 ConeCash: optimize mesh for transform cache, picking triangles that reduce ACMR but prioritizing those that keep current cluster planar
 MaskBase: split sequentially into 64-triangle clusters, store a 64-bit conservative triangle mask for 6 frustums (cube faces)
 ManyConeN: split sequentially into 64-triangle clusters, store N (up to 4) cones for each cluster and a cone id per triangle (2 bits)

 Note that all Cone* solutions get significantly worse results with 128 or 256 triangle clusters; it doesn't matter much for Mask.
 The biggest challenge with Cone* algorithms is t
API	Share
Direct3D 11+	85% (+5%)
Direct3D 10.1	8.5% (-1.5%)
Direct3D 10.0	5.5% (-2.5%)
Direct3D 9	1% (-1%)
	a1.medium $ ./build/release/meshoptimizer buddha.obj
	# buddha.obj: 549409 vertices, 1087474 triangles; read in 483.46 msec; indexed in 354.61 msec
	Original : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) Overfetch 2.105950 Overdraw 1.200370 in 0.00 msec
	Random : ACMR 2.999919 ATVR 5.937897 (NV 5.937882 AMD 5.937935 Intel 5.936783) Overfetch 10.839888 Overdraw 1.218682 in 33.38 msec
	Cache : ACMR 0.661465 ATVR 1.309272 (NV 1.590738 AMD 1.434356 Intel 1.138871) Overfetch 1.509062 Overdraw 1.206893 in 477.86 msec
	CacheFifo: ACMR 0.689948 ATVR 1.365651 (NV 1.706663 AMD 1.516610 Intel 1.229416) Overfetch 1.552013 Overdraw 1.197034 in 146.00 msec
	Overdraw : ACMR 2.776432 ATVR 5.495538 (NV 5.508446 AMD 5.527603 Intel 5.314811) Overfetch 8.624212 Overdraw 1.086317 in 209.29 msec
	Fetch : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) Overfetch 2.105950 Overdraw 1.200370 in 25.13 msec
	FetchMap : ACMR 1.556966 ATVR 3.081784 (NV 3.124747 AMD 3.277660 Intel 2.289651) O
	~/pugixml$ cat ucd.cpp
	#include "pugixml.hpp"
	#include <malloc.h>

	int main()
	{
	pugi::xml_document doc;
	doc.load_file("ucd.all.grouped.xml");
	malloc_stats();
	}
	class RemapInterfaceIdsPass : public spvtools::opt::Pass
	{
	public:
	const char* name() const override { return "remap-interface-ids"; }

	RemapInterfaceIdsPass(uint32_t start_id): start_id(start_id)
	{
	}

	Status Process() override
	#pragma once

	#include <string>

	namespace fmt {

	#define KIND(X) \
	X(bool,bool) \
	X(signed char,schar) \
	X(unsigned char,uchar) \
	namespace tstd
	{
	template <typename T>
	class vector
	{
	public:
	typedef T* iterator;
	typedef const T* const_iterator;

	iterator begin() { return begin_; }
	#include <stdio.h>
	#include <stdint.h>

	// Returns the next value of a deterministic sequence kept in a function-local
	// static: every call replaces the state with state * 17 + 1 (wrapping at
	// 32 bits) and returns the new state. The initial state is 0xdeadbeef.
	// The function is marked noinline via __declspec.
	__declspec(noinline)
	uint32_t giveMeRand()
	{
		static uint32_t state = 0xdeadbeef;
		state = state * 17u + 1u;
		return state;
	}
	diff --git a/src/indexcodec.cpp b/src/indexcodec.cpp
	index 4ab92cb..18ccd43 100644
	--- a/src/indexcodec.cpp
	+++ b/src/indexcodec.cpp
	@@ -7,9 +7,6 @@
	// This work is based on:
	// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
	// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
	-namespace meshopt
	-{
	Using fixed-point encoding for position (14 bits per component) and texture coordinates (12 bits per component), with a 32-bit index buffer
	and this vertex format:

	// Packed vertex, 12 bytes:
	//   px, py, pz : 3 x unsigned short
	//   nu, nv     : 2 x unsigned char
	//   tx, ty     : 2 x unsigned short
	struct PackedVertexOct
	{
		unsigned short px;
		unsigned short py;
		unsigned short pz;

		// octahedron encoded normal, aliases .pw
		unsigned char nu;
		unsigned char nv;

		unsigned short tx;
		unsigned short ty;
	};
	Algorithms used for Cone* preprocess the mesh in some way, then split sequentially into 64-triangle clusters:
	ConeBase: optimize mesh for transform cache
	ConeSort: split mesh into large planar connected clusters, bin clusters into 6 buckets by cardinal axes, optimize each bucket for transform cache
	ConeAcmr: optimize mesh for transform cache, split sequentially into variable length clusters that are relatively planar, sort clusters by avg normal
	ConeCash: optimize mesh for transform cache, picking triangles that reduce ACMR but prioritizing those that keep current cluster planar
	MaskBase: split sequentially into 64-triangle clusters, store a 64-bit conservative triangle mask for 6 frustums (cube faces)
	ManyConeN: split sequentially into 64-triangle clusters, store N (up to 4) cones for each cluster and a cone id per triangle (2 bits)

	Note that all Cone* solutions get significantly worse results with 128 or 256 triangle clusters; it doesn't matter much for Mask.
	The biggest challenge with Cone* algorithms is t