sowich · November 22, 2016 12:16 · sowich · Nov 22, 2016
diff --git a/test.py b/test.py
 # -*- coding: UTF-8 -*-

 import threading
 import Queue
 import os
 import datetime
 import sys
 import time

 # Abort with a bare Exception when this file is loaded under the module
 # name '__build__'.
 # NOTE(review): presumably a guard against a build tool importing/executing
 # the script -- nothing else in this file refers to '__build__'; confirm.
 if __name__ == '__build__':
    raise Exception


 class Worker(threading.Thread):
    """Daemon thread that compares pairs of texts pulled from a shared queue.

    Every task is an indexable item ``[text1, text2, name1, name2]``.  The
    two texts are canonized, split into word shingles, hashed with CRC32 and
    compared; processed tasks are pushed onto ``output``.
    """

    # Characters stripped from both ends of every whitespace-separated token.
    _STOP_SYMBOLS = '.,!?:;-\n\r()'

    # Tokens removed from the text before shingling.  Stored as a frozenset
    # so it is built once and membership tests are O(1).  Entries that
    # contain a space can never match, because tokens are produced by
    # splitting on whitespace; they are kept to leave the list untouched.
    _STOP_WORDS = frozenset((
        'and', 'in', 'into', 'not', 'what', 'that', 'he', 'on', 'onto', 'i', 'from', 'how', 'all', 'she',
        'so', 'thus', 'him', 'but', 'yes', 'and', 'tho', 'towards', 'by', 'around', 'chez',
        'intensifier particle', 'yo', 'beyond', 'behind', 'conditional', 'up to', 'along', 'only', 'her',
        'to me', 'it was', 'here is', 'here are', 'particle', 'away from', 'me', 'still', 'yet', 'more',
        'no',
        'about', 'out of', 'to him', 'now', 'when', 'even', 'so', 'well', 'suddenly',
        'interrogative particle',
        'if', 'already', 'or', 'neither', 'to be', 'he was', 'prepositional form of его', 'up to',
        'you accusative', 'again', 'to yo', 'he said', 'there', 'then', 'oneself', 'nothing', 'to her',
        'they',
        'here', 'where', 'got to', 'must', 'for', 'we', 'thee', 'them', 'their', 'than', 'she was', 'self',
        'in order to', 'without', 'as if', 'man', 'person', 'once', 'also', 'to oneself', 'beneath', 'life',
        'will be', 'then', 'who', 'this', 'was saying', 'for that reason', 'which', 'altogether', 'here',
        'one',
        'almost', 'my', 'instrumental', 'dative', 'it seems', 'now', 'they were', 'where to', 'why', 'to say',
        'all', 'never', 'today', 'possible', 'one can', 'by', 'finally', 'two', 'about', 'another', 'even',
        'after', 'above', 'more', 'across', 'these', 'us', 'about', 'in all', 'of all', 'they', 'which',
        'feminine', 'lots', 'interrogative particle', 'she said', 'three', 'this', 'my', 'feminine',
        'moreover',
        'besides', 'good', 'ones own', 'this', 'in front of', 'sometimes', 'better', 'a little', 'preposn',
        'one must not', 'such a one', 'to them', 'more', 'always', 'of course', 'all', 'between',
    ))

    # Number of consecutive words hashed into a single shingle.
    _SHINGLE_LEN = 10

    def __init__(self, work_queue, output):
        """Store the task queue and the result queue; mark the thread daemon.

        work_queue -- queue of pending ``[text1, text2, name1, name2]`` tasks.
        output     -- queue that receives every successfully processed task.
        """
        super(Worker, self).__init__()
        self.daemon = True

        self.work_queue = work_queue
        self.output = output

    def run(self):
        """Process tasks until the work queue is found empty.

        A task whose processing raises is reported on stderr and put back on
        the work queue so it will be tried again.
        """
        while True:
            start = datetime.datetime.now().strftime('%H:%M:%S')
            try:
                target = self.work_queue.get(block=False)
            except Queue.Empty:
                sys.stderr.write('%s get Queue.EMPTY exception\r\n' % self.name)
                break

            try:
                self.process(target[0], target[1], target[2], target[3])

                print('%s: %s %s -> %s' % (start, self.name, target[2], target[3]))
                self.output.put(target, block=False)
            except Exception as e:
                sys.stderr.write('%s get %s exception\r\n' % (self.name, e))
                # Re-queue BEFORE task_done() runs (in ``finally``): doing it
                # the other way round lets the unfinished-task counter reach
                # zero, so a concurrent join() could return while this task
                # is still waiting to be retried.
                # NOTE(review): the retry is unbounded -- a task that fails
                # deterministically is re-queued forever.
                self.work_queue.put(target, block=False)
            finally:
                # Exactly one task_done() per successful get().
                self.work_queue.task_done()

    def process(self, text1, text2, fn1, fn2):
        """Compare two texts and print a line when they are similar enough.

        text1, text2 -- raw texts to compare.
        fn1, fn2     -- labels used only in the printed message.
        """
        cmp1 = self.genshingle(self.canonize(text1))
        cmp2 = self.genshingle(self.canonize(text2))

        res = self.compaire(cmp1, cmp2)
        # NOTE(review): compaire() returns a percentage (0..100), so this
        # threshold is 0.9 %, not 90 % -- confirm which one is intended.
        if res >= 0.9:
            print("Source: %s -> Dest: %s! Matches: %f" % (fn1, fn2, res))

    def canonize(self, source):
        """Return the lower-cased words of ``source`` without noise.

        The text is split on whitespace, every token is stripped of the
        punctuation in ``_STOP_SYMBOLS``, and empty tokens and tokens listed
        in ``_STOP_WORDS`` are dropped.  Word order is preserved.
        """
        stripped = (word.strip(self._STOP_SYMBOLS) for word in source.lower().split())
        return [word for word in stripped if word and word not in self._STOP_WORDS]

    def genshingle(self, source):
        """Return the CRC32 of every run of ``_SHINGLE_LEN`` consecutive words.

        source -- list of words (byte strings or text strings).
        Returns a list with ``len(source) - _SHINGLE_LEN + 1`` checksums, or
        an empty list when ``source`` is shorter than one shingle.
        """
        import binascii

        out = []
        for i in range(len(source) - (self._SHINGLE_LEN - 1)):
            shingle = ' '.join(source[i:i + self._SHINGLE_LEN])
            # crc32 needs bytes.  Byte strings are hashed as they are (no
            # decode/encode round trip that would raise on non-UTF-8 data);
            # text strings are encoded as UTF-8.
            if not isinstance(shingle, bytes):
                shingle = shingle.encode('utf-8')
            out.append(binascii.crc32(shingle))

        return out

    def compaire(self, source1, source2):
        """Return the similarity of two checksum lists as a percentage.

        The result is ``2 * same / (len(source1) + len(source2)) * 100``,
        where ``same`` counts the items of ``source1`` (duplicates included)
        that occur in ``source2``.  Returns 0.0 when both lists are empty.
        """
        total = len(source1) + len(source2)
        if not total:
            return 0.0

        known = set(source2)  # O(1) membership instead of scanning the list
        same = sum(1 for item in source1 if item in known)
        return same * 2 / float(total) * 100


 class Test(object):
    """Runs a batch of comparison tasks on a pool of Worker threads."""

    def __init__(self, data, number_threads):
        """Queue every item of ``data`` and remember the pool size.

        data           -- iterable of tasks to put on the work queue.
        number_threads -- how many Worker threads execute() will start.
        """
        self.NUMBER_THREADS = number_threads
        self.threads = []
        self.output = Queue.Queue()
        self.queue = Queue.Queue()

        for task in data:
            self.queue.put(task)

    def execute(self):
        """Start the workers, then block until the work queue is joined."""
        for _ in xrange(self.NUMBER_THREADS):
            worker = Worker(self.queue, self.output)
            self.threads.append(worker)
            worker.start()

        self.queue.join()


 # Collect the *.txt files of ./files, build one comparison task for every
 # ordered pair of distinct files, run the tasks on 10 worker threads and
 # report the elapsed time and the number of processed tasks.
 files = os.listdir(os.getcwd() + '/files')
 filelistMain = [name for name in files if name.endswith('.txt')]

 # Read every file exactly once; the handle is closed even if read() fails.
 texts = {}
 for fn in filelistMain:
    with open(os.getcwd() + '/files/' + fn, 'r') as source:
        texts[fn] = source.read()

 # A task is [source text, destination text, source name, destination name].
 tasks = []
 for fn in filelistMain:
    for filename in filelistMain:
        if filename == fn:
            continue

        tasks.append([texts[fn], texts[filename], fn, filename])

 t = datetime.datetime.now()
 test = Test(tasks, 10)
 test.execute()
 print('\r\nthe end in %s\r\n' % (datetime.datetime.now() - t))

 # Number of tasks the workers pushed to the output queue.
 print(test.output.qsize())
	# -*- coding: UTF-8 -*-

	import threading
	import Queue
	import os
	import datetime
	import sys
	import time

	# Abort with a bare Exception when this file is loaded under the module
	# name '__build__'.
	# NOTE(review): presumably a guard against a build tool importing/executing
	# the script -- nothing else in this file refers to '__build__'; confirm.
	if __name__ == '__build__':
	    raise Exception


	class Worker(threading.Thread):
	    """Daemon thread that compares pairs of texts pulled from a shared queue.

	    Every task is an indexable item ``[text1, text2, name1, name2]``.  The
	    two texts are canonized, split into word shingles, hashed with CRC32 and
	    compared; processed tasks are pushed onto ``output``.
	    """

	    # Characters stripped from both ends of every whitespace-separated token.
	    _STOP_SYMBOLS = '.,!?:;-\n\r()'

	    # Tokens removed from the text before shingling.  Stored as a frozenset
	    # so it is built once and membership tests are O(1).  Entries that
	    # contain a space can never match, because tokens are produced by
	    # splitting on whitespace; they are kept to leave the list untouched.
	    _STOP_WORDS = frozenset((
	        'and', 'in', 'into', 'not', 'what', 'that', 'he', 'on', 'onto', 'i', 'from', 'how', 'all', 'she',
	        'so', 'thus', 'him', 'but', 'yes', 'and', 'tho', 'towards', 'by', 'around', 'chez',
	        'intensifier particle', 'yo', 'beyond', 'behind', 'conditional', 'up to', 'along', 'only', 'her',
	        'to me', 'it was', 'here is', 'here are', 'particle', 'away from', 'me', 'still', 'yet', 'more',
	        'no',
	        'about', 'out of', 'to him', 'now', 'when', 'even', 'so', 'well', 'suddenly',
	        'interrogative particle',
	        'if', 'already', 'or', 'neither', 'to be', 'he was', 'prepositional form of его', 'up to',
	        'you accusative', 'again', 'to yo', 'he said', 'there', 'then', 'oneself', 'nothing', 'to her',
	        'they',
	        'here', 'where', 'got to', 'must', 'for', 'we', 'thee', 'them', 'their', 'than', 'she was', 'self',
	        'in order to', 'without', 'as if', 'man', 'person', 'once', 'also', 'to oneself', 'beneath', 'life',
	        'will be', 'then', 'who', 'this', 'was saying', 'for that reason', 'which', 'altogether', 'here',
	        'one',
	        'almost', 'my', 'instrumental', 'dative', 'it seems', 'now', 'they were', 'where to', 'why', 'to say',
	        'all', 'never', 'today', 'possible', 'one can', 'by', 'finally', 'two', 'about', 'another', 'even',
	        'after', 'above', 'more', 'across', 'these', 'us', 'about', 'in all', 'of all', 'they', 'which',
	        'feminine', 'lots', 'interrogative particle', 'she said', 'three', 'this', 'my', 'feminine',
	        'moreover',
	        'besides', 'good', 'ones own', 'this', 'in front of', 'sometimes', 'better', 'a little', 'preposn',
	        'one must not', 'such a one', 'to them', 'more', 'always', 'of course', 'all', 'between',
	    ))

	    # Number of consecutive words hashed into a single shingle.
	    _SHINGLE_LEN = 10

	    def __init__(self, work_queue, output):
	        """Store the task queue and the result queue; mark the thread daemon.

	        work_queue -- queue of pending ``[text1, text2, name1, name2]`` tasks.
	        output     -- queue that receives every successfully processed task.
	        """
	        super(Worker, self).__init__()
	        self.daemon = True

	        self.work_queue = work_queue
	        self.output = output

	    def run(self):
	        """Process tasks until the work queue is found empty.

	        A task whose processing raises is reported on stderr and put back on
	        the work queue so it will be tried again.
	        """
	        while True:
	            start = datetime.datetime.now().strftime('%H:%M:%S')
	            try:
	                target = self.work_queue.get(block=False)
	            except Queue.Empty:
	                sys.stderr.write('%s get Queue.EMPTY exception\r\n' % self.name)
	                break

	            try:
	                self.process(target[0], target[1], target[2], target[3])

	                print('%s: %s %s -> %s' % (start, self.name, target[2], target[3]))
	                self.output.put(target, block=False)
	            except Exception as e:
	                sys.stderr.write('%s get %s exception\r\n' % (self.name, e))
	                # Re-queue BEFORE task_done() runs (in ``finally``): doing it
	                # the other way round lets the unfinished-task counter reach
	                # zero, so a concurrent join() could return while this task
	                # is still waiting to be retried.
	                # NOTE(review): the retry is unbounded -- a task that fails
	                # deterministically is re-queued forever.
	                self.work_queue.put(target, block=False)
	            finally:
	                # Exactly one task_done() per successful get().
	                self.work_queue.task_done()

	    def process(self, text1, text2, fn1, fn2):
	        """Compare two texts and print a line when they are similar enough.

	        text1, text2 -- raw texts to compare.
	        fn1, fn2     -- labels used only in the printed message.
	        """
	        cmp1 = self.genshingle(self.canonize(text1))
	        cmp2 = self.genshingle(self.canonize(text2))

	        res = self.compaire(cmp1, cmp2)
	        # NOTE(review): compaire() returns a percentage (0..100), so this
	        # threshold is 0.9 %, not 90 % -- confirm which one is intended.
	        if res >= 0.9:
	            print("Source: %s -> Dest: %s! Matches: %f" % (fn1, fn2, res))

	    def canonize(self, source):
	        """Return the lower-cased words of ``source`` without noise.

	        The text is split on whitespace, every token is stripped of the
	        punctuation in ``_STOP_SYMBOLS``, and empty tokens and tokens listed
	        in ``_STOP_WORDS`` are dropped.  Word order is preserved.
	        """
	        stripped = (word.strip(self._STOP_SYMBOLS) for word in source.lower().split())
	        return [word for word in stripped if word and word not in self._STOP_WORDS]

	    def genshingle(self, source):
	        """Return the CRC32 of every run of ``_SHINGLE_LEN`` consecutive words.

	        source -- list of words (byte strings or text strings).
	        Returns a list with ``len(source) - _SHINGLE_LEN + 1`` checksums, or
	        an empty list when ``source`` is shorter than one shingle.
	        """
	        import binascii

	        out = []
	        for i in range(len(source) - (self._SHINGLE_LEN - 1)):
	            shingle = ' '.join(source[i:i + self._SHINGLE_LEN])
	            # crc32 needs bytes.  Byte strings are hashed as they are (no
	            # decode/encode round trip that would raise on non-UTF-8 data);
	            # text strings are encoded as UTF-8.
	            if not isinstance(shingle, bytes):
	                shingle = shingle.encode('utf-8')
	            out.append(binascii.crc32(shingle))

	        return out

	    def compaire(self, source1, source2):
	        """Return the similarity of two checksum lists as a percentage.

	        The result is ``2 * same / (len(source1) + len(source2)) * 100``,
	        where ``same`` counts the items of ``source1`` (duplicates included)
	        that occur in ``source2``.  Returns 0.0 when both lists are empty.
	        """
	        total = len(source1) + len(source2)
	        if not total:
	            return 0.0

	        known = set(source2)  # O(1) membership instead of scanning the list
	        same = sum(1 for item in source1 if item in known)
	        return same * 2 / float(total) * 100


	class Test(object):
	    """Runs a batch of comparison tasks on a pool of Worker threads."""

	    def __init__(self, data, number_threads):
	        """Queue every item of ``data`` and remember the pool size.

	        data           -- iterable of tasks to put on the work queue.
	        number_threads -- how many Worker threads execute() will start.
	        """
	        self.queue = Queue.Queue()
	        self.output = Queue.Queue()

	        for item in data:
	            self.queue.put(item)

	        self.NUMBER_THREADS = number_threads
	        self.threads = []

	    def execute(self):
	        """Start the workers, then block until the work queue is joined."""
	        for i in xrange(self.NUMBER_THREADS):
	            self.threads.append(Worker(self.queue, self.output))
	            self.threads[-1].start()

	        self.queue.join()


	# Collect the *.txt files of ./files, build one comparison task for every
	# ordered pair of distinct files, run the tasks on 10 worker threads and
	# report the elapsed time and the number of processed tasks.
	files = os.listdir(os.getcwd() + '/files')
	filelistMain = [name for name in files if name.endswith('.txt')]

	# Read every file exactly once; the handle is closed even if read() fails.
	texts = {}
	for fn in filelistMain:
	    with open(os.getcwd() + '/files/' + fn, 'r') as source:
	        texts[fn] = source.read()

	# A task is [source text, destination text, source name, destination name].
	tasks = []
	for fn in filelistMain:
	    for filename in filelistMain:
	        if filename == fn:
	            continue

	        tasks.append([texts[fn], texts[filename], fn, filename])

	t = datetime.datetime.now()
	test = Test(tasks, 10)
	test.execute()
	print('\r\nthe end in %s\r\n' % (datetime.datetime.now() - t))

	# Number of tasks the workers pushed to the output queue.
	print(test.output.qsize())