Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save hughperkins/39ad7e00fdd3a91f1af3fb9cc7a5a744 to your computer and use it in GitHub Desktop.
(env3) (tensorflow-cl|…10△2) ~/git/tensorflow-cl$ git diff
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index f18ee5e..ba664ea 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -18,6 +18,8 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/protobuf/config.pb.h"
+#include <sstream>
+
namespace gpu = ::perftools::gputools;
namespace tensorflow {
@@ -142,23 +144,45 @@ void EventMgr::PollLoop() {
polling_stopped_->Notify();
}
+std::string EventMgr::debugIU(const InUse &iu) {
+ std::ostringstream ss;
+ std::cout << " debugui iu=" << &iu << std::endl;
+ std::cout << " debugui origfn=" << iu.funcOrig << std::endl;
+ std::cout << " debugui pre=" << iu.pre << std::endl;
+ std::cout << " debugui post=" << iu.post << std::endl;
+ std::cout << " debugui &iu.func=" << &iu.func << std::endl;
+ std::cout << " debugui (char *)&iu.func=" << (char *)&iu.func << std::endl;
+ std::cout << " debugui (long *)(char *)&iu.func=" << (long *)(char *)&iu.func << std::endl;
+ std::cout << " debugui *(long *)(char *)&iu.func=" << *(long *)(char *)&iu.func << std::endl;
+ ss << "iu=" << &iu << " origfn=" << iu.funcOrig << " pre=" << iu.pre << " func=" << *(long*)(char*)(&iu.func) << " post=" << iu.post;
+ return ss.str();
+}
+
void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
VLOG(2) << "QueueInUse free_events_ " << free_events_.size()
<< " used_events_ " << used_events_.size();
+ std::cout << "QueueInUse() " << debugIU(iu) << std::endl;
+
// Events are created on demand, and repeatedly reused. There is no
// limit placed here on the number of allocated Events.
if (free_events_.empty()) {
+ std::cout << " queueInUse no free events: creating new one" << std::endl;
free_events_.push_back(new gpu::Event(exec_));
free_events_.back()->Init();
}
gpu::Event* e = free_events_.back();
+ // std::cout << " queueInUse event " << e << std::endl;
free_events_.pop_back();
stream->ThenRecordEvent(e);
iu.event = e;
+ std::cout << " queueInUse event=" << e << " " << debugIU(iu) << std::endl;
bool was_empty = used_events_.empty();
used_events_.push_back(iu);
+ std::cout << " queueInUse queued iu used_events[used_events.size() - 1] " << debugIU(used_events_[used_events_.size() - 1]) << " used_events_.size() " << used_events_.size() << std::endl;
+ //InUse *iuqueued = &used_events_[used_events_.size() - 1];
// Maybe wake up the polling thread
if (was_empty) events_pending_.notify_all();
+ std::cout << " queueInUse after notify_all(): used_events_.size() " << used_events_.size() << std::endl;
}
// This function must be called periodically to check whether pending
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index b97b5e4..9975f06 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -102,12 +102,16 @@ class EventMgr {
perftools::gputools::Event* event;
TensorReferenceVector* mem;
BufRec bufrec;
+ long funcOrig;
+ long long pre;
const std::function<void()> &func;
+ long long post;
};
typedef gtl::InlinedVector<InUse, 4> ToFreeVector;
void FreeMemory(const ToFreeVector& to_free) {
+ std::cout << "core/common_runtime/gpu/gpu_event_mgr.h FreeMemory()" << std::endl;
for (const auto& iu : to_free) {
if (iu.mem != nullptr) {
for (auto& t : *(iu.mem)) {
@@ -124,10 +128,13 @@ class EventMgr {
iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
}
// The function must be called in another thread.
+ std::cout << debugIU(iu);
if (iu.func != nullptr) threadpool_.Schedule(iu.func);
}
}
+ std::string debugIU(const InUse &iu);
+
// Stream-enqueue an unused Event and save with it a collection of
// Tensors and/or a BufRec to be deleted only after the Event
// records.
@@ -137,17 +144,17 @@ class EventMgr {
void QueueTensors(perftools::gputools::Stream* stream,
TensorReferenceVector* tensors)
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
- QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
+ QueueInUse(stream, {nullptr, tensors, BufRec(), 0, 123, nullptr, 123});
}
void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
- QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
+ QueueInUse(stream, {nullptr, nullptr, bufrec, 0, 123, nullptr, 123});
}
void QueueFunc(perftools::gputools::Stream* stream,
const std::function<void()> &func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
- QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
+ QueueInUse(stream, {nullptr, nullptr, BufRec(), *(long *)(char *)&func, 123, func, 123});
}
// This function should be called at roughly the same tempo as
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment