Created
November 22, 2016 12:16
-
-
Save sowich/ef16e7b755cd4a7fb6b0f2086234cbe3 to your computer and use it in GitHub Desktop.
test.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
import threading | |
import Queue | |
import os | |
import datetime | |
import sys | |
import time | |
# Refuse to continue when this module is executed under the name '__build__'.
if __name__ == '__build__':
    raise Exception()
class Worker(threading.Thread):
    """Daemon thread that compares pairs of texts pulled from a shared queue.

    Every task is a 4-item sequence ``(text1, text2, name1, name2)``. The
    worker computes a shingle-based similarity for the two texts, prints the
    pairs that reach the threshold, and puts each finished task on ``output``.

    NOTE(review): the comparison is pure-Python CPU work. On CPython the GIL
    presumably serialises it, so adding threads is unlikely to speed it up;
    consider ``multiprocessing`` if parallel speed-up is the goal.
    """

    # Characters trimmed from both ends of every token.
    _STOP_SYMBOLS = '.,!?:;-\n\r()'

    # Tokens dropped by canonize(). Kept as a frozenset so each lookup is O(1)
    # instead of a linear scan. Entries containing a space can never match,
    # because tokens are produced by splitting on whitespace; they are kept
    # unchanged from the original list.
    _STOP_WORDS = frozenset((
        'and', 'in', 'into', 'not', 'what', 'that', 'he', 'on', 'onto', 'i', 'from', 'how', 'all', 'she',
        'so', 'thus', 'him', 'but', 'yes', 'and', 'tho', 'towards', 'by', 'around', 'chez',
        'intensifier particle', 'yo', 'beyond', 'behind', 'conditional', 'up to', 'along', 'only', 'her',
        'to me', 'it was', 'here is', 'here are', 'particle', 'away from', 'me', 'still', 'yet', 'more',
        'no',
        'about', 'out of', 'to him', 'now', 'when', 'even', 'so', 'well', 'suddenly',
        'interrogative particle',
        'if', 'already', 'or', 'neither', 'to be', 'he was', 'prepositional form of его', 'up to',
        'you accusative', 'again', 'to yo', 'he said', 'there', 'then', 'oneself', 'nothing', 'to her',
        'they',
        'here', 'where', 'got to', 'must', 'for', 'we', 'thee', 'them', 'their', 'than', 'she was', 'self',
        'in order to', 'without', 'as if', 'man', 'person', 'once', 'also', 'to oneself', 'beneath', 'life',
        'will be', 'then', 'who', 'this', 'was saying', 'for that reason', 'which', 'altogether', 'here',
        'one',
        'almost', 'my', 'instrumental', 'dative', 'it seems', 'now', 'they were', 'where to', 'why', 'to say',
        'all', 'never', 'today', 'possible', 'one can', 'by', 'finally', 'two', 'about', 'another', 'even',
        'after', 'above', 'more', 'across', 'these', 'us', 'about', 'in all', 'of all', 'they', 'which',
        'feminine', 'lots', 'interrogative particle', 'she said', 'three', 'this', 'my', 'feminine',
        'moreover',
        'besides', 'good', 'ones own', 'this', 'in front of', 'sometimes', 'better', 'a little', 'preposn',
        'one must not', 'such a one', 'to them', 'more', 'always', 'of course', 'all', 'between'))

    def __init__(self, work_queue, output):
        """Store the queue to consume tasks from and the queue for results."""
        super(Worker, self).__init__()
        self.daemon = True
        self.work_queue = work_queue
        self.output = output

    def run(self):
        """Process tasks until the work queue is found empty."""
        while True:
            start = datetime.datetime.now().strftime('%H:%M:%S')
            try:
                target = self.work_queue.get(block=False)
            except Queue.Empty:
                sys.stderr.write('%s get Queue.EMPTY exception\r\n' % self.name)
                break
            try:
                self.process(target[0], target[1], target[2], target[3])
                print('%s: %s %s -> %s' % (start, self.name, target[2], target[3]))
                self.output.put(target, block=False)
            except Exception as e:
                sys.stderr.write('%s get %s exception\r\n' % (self.name, e))
                # Re-queue the failed task BEFORE task_done() runs below;
                # otherwise the unfinished-task counter can reach zero and
                # Queue.join() may return while this task is still pending.
                # NOTE(review): a task that fails every time is retried
                # without limit, as in the original code.
                self.work_queue.put(target, block=False)
            finally:
                self.work_queue.task_done()

    def process(self, text1, text2, fn1, fn2):
        """Compare two texts and print the pair when the score is >= 0.9.

        ``fn1`` and ``fn2`` are only used to label the printed message.
        """
        cmp1 = self.genshingle(self.canonize(text1))
        cmp2 = self.genshingle(self.canonize(text2))
        res = self.compaire(cmp1, cmp2)
        # NOTE(review): compaire() returns a percentage in 0..100, so this
        # threshold is 0.9 percent; confirm that 90 was not intended.
        if res >= 0.9:
            print("Source: %s -> Dest: %s! Matches: %f" % (fn1, fn2, res))

    def canonize(self, source):
        """Return the lower-cased tokens of ``source`` without noise.

        The text is split on whitespace, the stop symbols are stripped from
        both ends of each token, and empty tokens and stop words are dropped.
        """
        stripped = (word.strip(self._STOP_SYMBOLS) for word in source.lower().split())
        return [token for token in stripped if token and token not in self._STOP_WORDS]

    def genshingle(self, source, shingle_len=10):
        """Return the CRC32 of every run of ``shingle_len`` consecutive tokens.

        ``source`` is a list of tokens, either UTF-8 byte strings or text.
        Returns an empty list when there are fewer than ``shingle_len``
        tokens. Raises ``UnicodeDecodeError`` when a byte-string token that
        takes part in a shingle is not valid UTF-8.
        """
        import binascii
        count = len(source) - (shingle_len - 1)
        if count <= 0:
            # Nothing to hash; tokens are deliberately not decoded here.
            return []
        # Decode each token once instead of once per shingle containing it.
        words = [tok.decode('utf-8') if isinstance(tok, bytes) else tok for tok in source]
        return [
            binascii.crc32(' '.join(words[i:i + shingle_len]).encode('utf-8'))
            for i in range(count)
        ]

    def compaire(self, source1, source2):
        """Return the similarity of two shingle lists as a percentage.

        The score is ``2 * same / (len(source1) + len(source2)) * 100``,
        where ``same`` counts the items of ``source1`` (duplicates included)
        that also occur in ``source2``. Returns 0.0 when both lists are
        empty. Items must be hashable.
        """
        total = len(source1) + len(source2)
        if not total:
            return 0.0
        # A set makes each membership test O(1); scanning the list made the
        # whole comparison quadratic.
        known = set(source2)
        same = sum(1 for shingle in source1 if shingle in known)
        return same * 2 / float(total) * 100
class Test(object):
    """Feed a batch of tasks to a pool of Worker threads and wait for them."""

    def __init__(self, data, number_threads):
        """Queue every item of ``data`` and remember the pool size."""
        self.queue = Queue.Queue()
        self.output = Queue.Queue()
        self.NUMBER_THREADS = number_threads
        self.threads = []
        for task in data:
            self.queue.put(task)

    def execute(self):
        """Start ``NUMBER_THREADS`` workers, then block until the queue is drained."""
        for _ in range(self.NUMBER_THREADS):
            worker = Worker(self.queue, self.output)
            self.threads.append(worker)
            worker.start()
        # Returns once task_done() has been called for every queued item.
        self.queue.join()
# Driver: compare every ordered pair of distinct .txt files found in
# ./files (relative to the current working directory) and report timing.
NUMBER_THREADS = 10

files_dir = os.path.join(os.getcwd(), 'files')
files = os.listdir(files_dir)
filelistMain = [name for name in files if name.endswith('.txt')]

# Read each file exactly once. Re-reading the destination file for every
# pair cost O(N^2) disk reads and kept a separate copy of the text per task.
texts = {}
for fn in filelistMain:
    with open(os.path.join(files_dir, fn), 'r') as source:
        texts[fn] = source.read()

# One task per ordered pair: [source text, dest text, source name, dest name].
tasks = []
for fn in filelistMain:
    for filename in filelistMain:
        if filename == fn:
            continue
        tasks.append([texts[fn], texts[filename], fn, filename])

t = datetime.datetime.now()
test = Test(tasks, NUMBER_THREADS)
test.execute()
print('\r\nthe end in %s\r\n' % (datetime.datetime.now() - t))
# Number of tasks the workers finished and put on the output queue.
print(test.output.qsize())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Скрипт для поиска аналогичных текстов на основе алгоритмов шингла!
В общем, всё должно работать многопоточно, но, как это ни парадоксально, 1 поток работает быстрее, чем, например, 5.
Итак, количество потоков задаётся на 144-й строке (в данном случае их 10).
Вся полезная работа выполняется в функции process.
Но почему-то, если я ставлю 10 потоков, скрипт выполняет всё за 17 секунд, а если 1 поток — то за 10!
Причем результат работы верный в обоих случаях, и все анализируется как и запланировано.
Но почему при бОльшем количестве потоков скрипт работает дольше, чем при одном? Какой-то парадокс, либо я жёстко туплю!
P.S. Мой первый скрипт на змее.
P.P.S. Тестировалось на Windows (Core i5) и на виртуальной машине (CentOS), везде по 4 ядра (если это актуально); результат примерно один, плюс-минус секунда (ну, это грубо).