mattatcha · December 28, 2017 18:12
diff --git a/connectbug-deployment.yaml b/connectbug-deployment.yaml
 # Deployment that runs the connectbug load generator used to reproduce the DNS errors.
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
  name: connectbug
 spec:
  replicas: 4
  template:
    metadata:
      labels:
        app: connectbug
    spec:
      # Workaround for the dnsmasq/kube-dns bottleneck: uncomment the line below so the
      # pods do not use the cluster DNS.
      # dnsPolicy: Default

      containers:
      - name: connectbug
        image: gcr.io/bluecore-qa/connectbug:20171005-9d5b955659
        # passed to connectbug.py as its single argument: the number of request threads
        args: ["50"]

        resources:
          requests:
            memory: 1Gi
            cpu: 0.6
          limits:
            memory: 1Gi
diff --git a/connectbug.py b/connectbug.py
 # To reproduce the DNS errors:
 #
 # 1. Build and publish the image with the attached Dockerfile.
 # 2. Create a new GKE cluster with 5 n1-standard-1 nodes
 # 3. Edit the DNS autoscaler to run a single kube-dns instance (the limit can also be
 #    exceeded with more instances, but that needs more nodes). Set
 #    "preventSinglePointFailure":false by running:
 #    kubectl edit configmap --namespace=kube-system kube-dns-autoscaler
 # 4. kubectl apply -f connectbug-deployment.yaml
 # 5. Observe the logs of one of the pods with kubectl logs -f. Notice that they say:
 #    WARNING:root:Ignoring error while making request: <urlopen error [Errno -2] Name or service not known>
 # 6. Run dmesg on the host with kube-dns, notice it says:
 #    [ 2110.051724] nf_conntrack: table full, dropping packet
 #    [ 2110.056937] nf_conntrack: table full, dropping packet
 #    [ 2110.062123] nf_conntrack: table full, dropping packet
 # 7. Edit connectbug-deployment.yaml and uncomment dnsPolicy: Default
 # 8. kubectl apply -f connectbug-deployment.yaml
 # 9. Observe that the errors go away
 import logging
 import socket
 import sys
 import thread
 import threading
 import time
 import urllib2


 # HTTP status code every response must carry; any other code is treated as fatal
 REQUEST_CODE = 200
 # URL that every worker thread requests in a loop
 REQUEST_URL = 'http://bluecore-qa.appspot.com/api/track'

 class Counter(object):
    """Thread-safe counter shared by the request threads.

    ``self.value += 1`` is a read-modify-write sequence spanning several
    bytecodes, so the global interpreter lock alone does not make it atomic:
    two threads can read the same value and one increment is lost.  A lock
    serializes the update instead.
    """

    def __init__(self):
        self.value = 0
        self._lock = threading.Lock()

    def increment(self):
        """Add one to the counter; safe to call from many threads at once."""
        with self._lock:
            self.value += 1

    def get(self):
        """Return the current count."""
        return self.value


 def send_requests(counter):
    """Request REQUEST_URL in an endless loop, counting each expected response.

    Intended as the body of a worker thread.  Timeouts and URL errors (such as
    failed name resolution) raised while opening the request are logged and the
    request is retried.  Any other failure -- including a response whose status
    code is not REQUEST_CODE -- is logged and then interrupts the main thread so
    the whole program stops.

    Args:
        counter: shared counter; incremented once per response with the expected code.
    """
    try:
        while True:
            req = urllib2.Request(REQUEST_URL, None, {})
            try:
                resp = urllib2.urlopen(req, timeout=30)
            except urllib2.HTTPError as http_error:
                # this is an error response: treat it as a normal response.  Bind it
                # explicitly instead of reusing the ``except ... as`` target, which
                # only stays bound after the handler on Python 2.
                resp = http_error
            except (socket.timeout, urllib2.URLError) as e:
                logging.warning('Ignoring error while making request: %s %s', e, REQUEST_URL)
                continue
            try:
                # read and discard the whole response body
                resp.read()
                if resp.getcode() != REQUEST_CODE:
                    raise Exception("request did not return expected code")
            finally:
                resp.close()

            counter.increment()

    except Exception:
        # attempt to crash the program if this fails
        logging.exception('ERROR in send_request')
        thread.interrupt_main()


 def main():
    """Start the request threads and report the aggregate request rate forever.

    Expects exactly one command-line argument: the number of worker threads, a
    positive integer.  Otherwise writes a usage message to stderr and exits
    with status 1.
    """
    usage = 'Usage: connectbug.py (threads)\n'
    if len(sys.argv) != 2:
        sys.stderr.write(usage)
        sys.exit(1)
    try:
        num_threads = int(sys.argv[1])
    except ValueError:
        num_threads = 0
    if num_threads < 1:
        # a non-numeric or non-positive thread count cannot generate any load
        sys.stderr.write(usage)
        sys.exit(1)

    # Enable all log levels; send to stdout.  An explicit handler is required for
    # that: the handler logging installs on its own writes to stderr.
    logging.basicConfig(stream=sys.stdout)
    logging.getLogger().setLevel(logging.DEBUG)

    counter = Counter()
    workers = []
    for _ in xrange(num_threads):
        # called "worker" so the local does not shadow the imported ``thread`` module
        worker = threading.Thread(target=send_requests, args=[counter])
        # mark as daemon so we exit if the main thread crashes
        worker.daemon = True
        worker.start()
        workers.append(worker)

    logging.info('started %d threads ...', len(workers))

    # length of each reporting window
    report_seconds = 15
    while True:
        start = counter.get()
        time.sleep(report_seconds)
        end = counter.get()

        logging.info('%f requests/second', (end-start)/float(report_seconds))


 if __name__ == '__main__':
    main()
diff --git a/Dockerfile b/Dockerfile
 # Python 2 base image: connectbug.py imports Python 2-only modules (urllib2, thread)
 FROM python:2.7.13-slim
 # copy the build context, including connectbug.py, into the image
 COPY . /app/
 WORKDIR /app
 # container args (the thread count) are appended to this command
 ENTRYPOINT ["python", "connectbug.py"]
	# Deployment that runs the connectbug load generator used to reproduce the DNS errors.
	apiVersion: extensions/v1beta1
	kind: Deployment
	metadata:
	name: connectbug
	spec:
	replicas: 4
	template:
	metadata:
	labels:
	app: connectbug
	spec:
	# Workaround for the dnsmasq/kube-dns bottleneck: uncomment the line below so the
	# pods do not use the cluster DNS.
	# dnsPolicy: Default

	containers:
	- name: connectbug
	image: gcr.io/bluecore-qa/connectbug:20171005-9d5b955659
	# passed to connectbug.py as its single argument: the number of request threads
	args: ["50"]

	resources:
	requests:
	memory: 1Gi
	cpu: 0.6
	limits:
	memory: 1Gi
	# To reproduce the DNS errors:
	#
	# 1. Build and publish the image with the attached Dockerfile.
	# 2. Create a new GKE cluster with 5 n1-standard-1 nodes
	# 3. Edit the DNS autoscaler to run a single kube-dns instance (the limit can also be
	# exceeded with more instances, but that needs more nodes). Set
	# "preventSinglePointFailure":false by running:
	# kubectl edit configmap --namespace=kube-system kube-dns-autoscaler
	# 4. kubectl apply -f connectbug-deployment.yaml
	# 5. Observe the logs of one of the pods with kubectl logs -f. Notice that they say:
	# WARNING:root:Ignoring error while making request: <urlopen error [Errno -2] Name or service not known>
	# 6. Run dmesg on the host with kube-dns, notice it says:
	# [ 2110.051724] nf_conntrack: table full, dropping packet
	# [ 2110.056937] nf_conntrack: table full, dropping packet
	# [ 2110.062123] nf_conntrack: table full, dropping packet
	# 7. Edit connectbug-deployment.yaml and uncomment dnsPolicy: Default
	# 8. kubectl apply -f connectbug-deployment.yaml
	# 9. Observe that the errors go away
	import logging
	import socket
	import sys
	import thread
	import threading
	import time
	import urllib2


	# HTTP status code every response must carry; any other code is treated as fatal
	REQUEST_CODE = 200
	# URL that every worker thread requests in a loop
	REQUEST_URL = 'http://bluecore-qa.appspot.com/api/track'

	class Counter(object):
	    """Thread-safe counter shared by the request threads.

	    ``self.value += 1`` is a read-modify-write sequence spanning several
	    bytecodes, so the global interpreter lock alone does not make it atomic:
	    two threads can read the same value and one increment is lost.  A lock
	    serializes the update instead.
	    """

	    def __init__(self):
	        self.value = 0
	        self._lock = threading.Lock()

	    def increment(self):
	        """Add one to the counter; safe to call from many threads at once."""
	        with self._lock:
	            self.value += 1

	    def get(self):
	        """Return the current count."""
	        return self.value


	def send_requests(counter):
	    """Request REQUEST_URL in an endless loop, counting each expected response.

	    Intended as the body of a worker thread.  Timeouts and URL errors (such as
	    failed name resolution) raised while opening the request are logged and the
	    request is retried.  Any other failure -- including a response whose status
	    code is not REQUEST_CODE -- is logged and then interrupts the main thread so
	    the whole program stops.

	    Args:
	        counter: shared counter; incremented once per response with the expected code.
	    """
	    try:
	        while True:
	            req = urllib2.Request(REQUEST_URL, None, {})
	            try:
	                resp = urllib2.urlopen(req, timeout=30)
	            except urllib2.HTTPError as http_error:
	                # this is an error response: treat it as a normal response.  Bind it
	                # explicitly instead of reusing the ``except ... as`` target, which
	                # only stays bound after the handler on Python 2.
	                resp = http_error
	            except (socket.timeout, urllib2.URLError) as e:
	                logging.warning('Ignoring error while making request: %s %s', e, REQUEST_URL)
	                continue
	            try:
	                # read and discard the whole response body
	                resp.read()
	                if resp.getcode() != REQUEST_CODE:
	                    raise Exception("request did not return expected code")
	            finally:
	                resp.close()

	            counter.increment()

	    except Exception:
	        # attempt to crash the program if this fails
	        logging.exception('ERROR in send_request')
	        thread.interrupt_main()


	def main():
	    """Start the request threads and report the aggregate request rate forever.

	    Expects exactly one command-line argument: the number of worker threads, a
	    positive integer.  Otherwise writes a usage message to stderr and exits
	    with status 1.
	    """
	    usage = 'Usage: connectbug.py (threads)\n'
	    if len(sys.argv) != 2:
	        sys.stderr.write(usage)
	        sys.exit(1)
	    try:
	        num_threads = int(sys.argv[1])
	    except ValueError:
	        num_threads = 0
	    if num_threads < 1:
	        # a non-numeric or non-positive thread count cannot generate any load
	        sys.stderr.write(usage)
	        sys.exit(1)

	    # Enable all log levels; send to stdout.  An explicit handler is required for
	    # that: the handler logging installs on its own writes to stderr.
	    logging.basicConfig(stream=sys.stdout)
	    logging.getLogger().setLevel(logging.DEBUG)

	    counter = Counter()
	    workers = []
	    for _ in xrange(num_threads):
	        # called "worker" so the local does not shadow the imported ``thread`` module
	        worker = threading.Thread(target=send_requests, args=[counter])
	        # mark as daemon so we exit if the main thread crashes
	        worker.daemon = True
	        worker.start()
	        workers.append(worker)

	    logging.info('started %d threads ...', len(workers))

	    # length of each reporting window
	    report_seconds = 15
	    while True:
	        start = counter.get()
	        time.sleep(report_seconds)
	        end = counter.get()

	        logging.info('%f requests/second', (end-start)/float(report_seconds))


	if __name__ == '__main__':
	    main()
	# Python 2 base image: connectbug.py imports Python 2-only modules (urllib2, thread)
	FROM python:2.7.13-slim
	# copy the build context, including connectbug.py, into the image
	COPY . /app/
	WORKDIR /app
	# container args (the thread count) are appended to this command
	ENTRYPOINT ["python", "connectbug.py"]